xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mul.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
8; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
9
10; mul24 and mad24 are affected
11
12define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
13; SI-LABEL: test_mul_v2i32:
14; SI:       ; %bb.0: ; %entry
15; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
16; SI-NEXT:    s_mov_b32 s7, 0xf000
17; SI-NEXT:    s_mov_b32 s6, -1
18; SI-NEXT:    s_mov_b32 s10, s6
19; SI-NEXT:    s_mov_b32 s11, s7
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    s_mov_b32 s8, s2
22; SI-NEXT:    s_mov_b32 s9, s3
23; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
24; SI-NEXT:    s_mov_b32 s4, s0
25; SI-NEXT:    s_mov_b32 s5, s1
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    v_mul_lo_u32 v1, v1, v3
28; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
29; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
30; SI-NEXT:    s_endpgm
31;
32; VI-LABEL: test_mul_v2i32:
33; VI:       ; %bb.0: ; %entry
34; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
35; VI-NEXT:    s_mov_b32 s7, 0xf000
36; VI-NEXT:    s_mov_b32 s6, -1
37; VI-NEXT:    s_mov_b32 s10, s6
38; VI-NEXT:    s_mov_b32 s11, s7
39; VI-NEXT:    s_waitcnt lgkmcnt(0)
40; VI-NEXT:    s_mov_b32 s8, s2
41; VI-NEXT:    s_mov_b32 s9, s3
42; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
43; VI-NEXT:    s_mov_b32 s4, s0
44; VI-NEXT:    s_mov_b32 s5, s1
45; VI-NEXT:    s_waitcnt vmcnt(0)
46; VI-NEXT:    v_mul_lo_u32 v1, v1, v3
47; VI-NEXT:    v_mul_lo_u32 v0, v0, v2
48; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
49; VI-NEXT:    s_endpgm
50;
51; GFX9-LABEL: test_mul_v2i32:
52; GFX9:       ; %bb.0: ; %entry
53; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
54; GFX9-NEXT:    s_mov_b32 s7, 0xf000
55; GFX9-NEXT:    s_mov_b32 s6, -1
56; GFX9-NEXT:    s_mov_b32 s10, s6
57; GFX9-NEXT:    s_mov_b32 s11, s7
58; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX9-NEXT:    s_mov_b32 s8, s2
60; GFX9-NEXT:    s_mov_b32 s9, s3
61; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
62; GFX9-NEXT:    s_mov_b32 s4, s0
63; GFX9-NEXT:    s_mov_b32 s5, s1
64; GFX9-NEXT:    s_waitcnt vmcnt(0)
65; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
66; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
67; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
68; GFX9-NEXT:    s_endpgm
69;
70; GFX10-LABEL: test_mul_v2i32:
71; GFX10:       ; %bb.0: ; %entry
72; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
73; GFX10-NEXT:    s_mov_b32 s6, -1
74; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
75; GFX10-NEXT:    s_mov_b32 s10, s6
76; GFX10-NEXT:    s_mov_b32 s11, s7
77; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX10-NEXT:    s_mov_b32 s8, s2
79; GFX10-NEXT:    s_mov_b32 s9, s3
80; GFX10-NEXT:    s_mov_b32 s4, s0
81; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82; GFX10-NEXT:    s_mov_b32 s5, s1
83; GFX10-NEXT:    s_waitcnt vmcnt(0)
84; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
85; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
86; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
87; GFX10-NEXT:    s_endpgm
88;
89; GFX11-LABEL: test_mul_v2i32:
90; GFX11:       ; %bb.0: ; %entry
91; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
92; GFX11-NEXT:    s_mov_b32 s6, -1
93; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
94; GFX11-NEXT:    s_mov_b32 s10, s6
95; GFX11-NEXT:    s_mov_b32 s11, s7
96; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX11-NEXT:    s_mov_b32 s8, s2
98; GFX11-NEXT:    s_mov_b32 s9, s3
99; GFX11-NEXT:    s_mov_b32 s4, s0
100; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
101; GFX11-NEXT:    s_mov_b32 s5, s1
102; GFX11-NEXT:    s_waitcnt vmcnt(0)
103; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v3
104; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v2
105; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
106; GFX11-NEXT:    s_endpgm
107;
108; GFX12-LABEL: test_mul_v2i32:
109; GFX12:       ; %bb.0: ; %entry
110; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
111; GFX12-NEXT:    s_mov_b32 s6, -1
112; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
113; GFX12-NEXT:    s_mov_b32 s10, s6
114; GFX12-NEXT:    s_mov_b32 s11, s7
115; GFX12-NEXT:    s_wait_kmcnt 0x0
116; GFX12-NEXT:    s_mov_b32 s8, s2
117; GFX12-NEXT:    s_mov_b32 s9, s3
118; GFX12-NEXT:    s_mov_b32 s4, s0
119; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
120; GFX12-NEXT:    s_mov_b32 s5, s1
121; GFX12-NEXT:    s_wait_loadcnt 0x0
122; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
123; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
124; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
125; GFX12-NEXT:    s_endpgm
126;
127; EG-LABEL: test_mul_v2i32:
128; EG:       ; %bb.0: ; %entry
129; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
130; EG-NEXT:    TEX 0 @6
131; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
132; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
133; EG-NEXT:    CF_END
134; EG-NEXT:    PAD
135; EG-NEXT:    Fetch clause starting at 6:
136; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
137; EG-NEXT:    ALU clause starting at 8:
138; EG-NEXT:     MOV * T0.X, KC0[2].Z,
139; EG-NEXT:    ALU clause starting at 9:
140; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T0.W,
141; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
142; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Z,
143; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
144entry:
  ; Lane-wise multiply of two <2 x i32> vectors read from global memory.
  ; %b_ptr addresses the <2 x i32> immediately following the one at %in, so
  ; the two operands are adjacent; the product is stored to %out.
  ; The assertions above expect a single 128-bit load covering both operands
  ; and one 32-bit low multiply per lane (v_mul_lo_u32 on the amdgcn targets,
  ; MULLO_INT on r600).
145  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
146  %a = load <2 x i32>, ptr addrspace(1) %in
147  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
148  %result = mul <2 x i32> %a, %b
149  store <2 x i32> %result, ptr addrspace(1) %out
150  ret void
151}
152
153define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
154; SI-LABEL: v_mul_v4i32:
155; SI:       ; %bb.0: ; %entry
156; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
157; SI-NEXT:    s_mov_b32 s7, 0xf000
158; SI-NEXT:    s_mov_b32 s6, -1
159; SI-NEXT:    s_mov_b32 s10, s6
160; SI-NEXT:    s_mov_b32 s11, s7
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    s_mov_b32 s8, s2
163; SI-NEXT:    s_mov_b32 s9, s3
164; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
165; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
166; SI-NEXT:    s_mov_b32 s4, s0
167; SI-NEXT:    s_mov_b32 s5, s1
168; SI-NEXT:    s_waitcnt vmcnt(0)
169; SI-NEXT:    v_mul_lo_u32 v3, v3, v7
170; SI-NEXT:    v_mul_lo_u32 v2, v2, v6
171; SI-NEXT:    v_mul_lo_u32 v1, v1, v5
172; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
173; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
174; SI-NEXT:    s_endpgm
175;
176; VI-LABEL: v_mul_v4i32:
177; VI:       ; %bb.0: ; %entry
178; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
179; VI-NEXT:    s_mov_b32 s7, 0xf000
180; VI-NEXT:    s_mov_b32 s6, -1
181; VI-NEXT:    s_mov_b32 s10, s6
182; VI-NEXT:    s_mov_b32 s11, s7
183; VI-NEXT:    s_waitcnt lgkmcnt(0)
184; VI-NEXT:    s_mov_b32 s8, s2
185; VI-NEXT:    s_mov_b32 s9, s3
186; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
187; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
188; VI-NEXT:    s_mov_b32 s4, s0
189; VI-NEXT:    s_mov_b32 s5, s1
190; VI-NEXT:    s_waitcnt vmcnt(0)
191; VI-NEXT:    v_mul_lo_u32 v3, v3, v7
192; VI-NEXT:    v_mul_lo_u32 v2, v2, v6
193; VI-NEXT:    v_mul_lo_u32 v1, v1, v5
194; VI-NEXT:    v_mul_lo_u32 v0, v0, v4
195; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
196; VI-NEXT:    s_endpgm
197;
198; GFX9-LABEL: v_mul_v4i32:
199; GFX9:       ; %bb.0: ; %entry
200; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
201; GFX9-NEXT:    s_mov_b32 s7, 0xf000
202; GFX9-NEXT:    s_mov_b32 s6, -1
203; GFX9-NEXT:    s_mov_b32 s10, s6
204; GFX9-NEXT:    s_mov_b32 s11, s7
205; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX9-NEXT:    s_mov_b32 s8, s2
207; GFX9-NEXT:    s_mov_b32 s9, s3
208; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
209; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
210; GFX9-NEXT:    s_mov_b32 s4, s0
211; GFX9-NEXT:    s_mov_b32 s5, s1
212; GFX9-NEXT:    s_waitcnt vmcnt(0)
213; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
214; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
215; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
216; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
217; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
218; GFX9-NEXT:    s_endpgm
219;
220; GFX10-LABEL: v_mul_v4i32:
221; GFX10:       ; %bb.0: ; %entry
222; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
223; GFX10-NEXT:    s_mov_b32 s6, -1
224; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
225; GFX10-NEXT:    s_mov_b32 s10, s6
226; GFX10-NEXT:    s_mov_b32 s11, s7
227; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX10-NEXT:    s_mov_b32 s8, s2
229; GFX10-NEXT:    s_mov_b32 s9, s3
230; GFX10-NEXT:    s_clause 0x1
231; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
232; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
233; GFX10-NEXT:    s_mov_b32 s4, s0
234; GFX10-NEXT:    s_mov_b32 s5, s1
235; GFX10-NEXT:    s_waitcnt vmcnt(0)
236; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
237; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
238; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
239; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
240; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
241; GFX10-NEXT:    s_endpgm
242;
243; GFX11-LABEL: v_mul_v4i32:
244; GFX11:       ; %bb.0: ; %entry
245; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
246; GFX11-NEXT:    s_mov_b32 s6, -1
247; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
248; GFX11-NEXT:    s_mov_b32 s10, s6
249; GFX11-NEXT:    s_mov_b32 s11, s7
250; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX11-NEXT:    s_mov_b32 s8, s2
252; GFX11-NEXT:    s_mov_b32 s9, s3
253; GFX11-NEXT:    s_clause 0x1
254; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
255; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
256; GFX11-NEXT:    s_mov_b32 s4, s0
257; GFX11-NEXT:    s_mov_b32 s5, s1
258; GFX11-NEXT:    s_waitcnt vmcnt(0)
259; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v7
260; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v6
261; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v5
262; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v4
263; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
264; GFX11-NEXT:    s_endpgm
265;
266; GFX12-LABEL: v_mul_v4i32:
267; GFX12:       ; %bb.0: ; %entry
268; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
269; GFX12-NEXT:    s_mov_b32 s6, -1
270; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
271; GFX12-NEXT:    s_mov_b32 s10, s6
272; GFX12-NEXT:    s_mov_b32 s11, s7
273; GFX12-NEXT:    s_wait_kmcnt 0x0
274; GFX12-NEXT:    s_mov_b32 s8, s2
275; GFX12-NEXT:    s_mov_b32 s9, s3
276; GFX12-NEXT:    s_clause 0x1
277; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
278; GFX12-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], null offset:16
279; GFX12-NEXT:    s_mov_b32 s4, s0
280; GFX12-NEXT:    s_mov_b32 s5, s1
281; GFX12-NEXT:    s_wait_loadcnt 0x0
282; GFX12-NEXT:    v_mul_lo_u32 v3, v3, v7
283; GFX12-NEXT:    v_mul_lo_u32 v2, v2, v6
284; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v5
285; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v4
286; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], null
287; GFX12-NEXT:    s_endpgm
288;
289; EG-LABEL: v_mul_v4i32:
290; EG:       ; %bb.0: ; %entry
291; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
292; EG-NEXT:    TEX 1 @6
293; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
294; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
295; EG-NEXT:    CF_END
296; EG-NEXT:    PAD
297; EG-NEXT:    Fetch clause starting at 6:
298; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
299; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
300; EG-NEXT:    ALU clause starting at 10:
301; EG-NEXT:     MOV * T0.X, KC0[2].Z,
302; EG-NEXT:    ALU clause starting at 11:
303; EG-NEXT:     MULLO_INT * T0.W, T0.W, T1.W,
304; EG-NEXT:     MULLO_INT * T0.Z, T0.Z, T1.Z,
305; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.Y,
306; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
307; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
308; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
309entry:
  ; Lane-wise multiply of two <4 x i32> vectors read from global memory.
  ; %b_ptr addresses the <4 x i32> immediately following the one at %in; the
  ; product is stored to %out.
  ; The assertions above expect two 128-bit loads (the second at byte offset
  ; 16) and four 32-bit low multiplies, one per lane (v_mul_lo_u32 on the
  ; amdgcn targets, MULLO_INT on r600).
310  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
311  %a = load <4 x i32>, ptr addrspace(1) %in
312  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
313  %result = mul <4 x i32> %a, %b
314  store <4 x i32> %result, ptr addrspace(1) %out
315  ret void
316}
317
318define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
319; SI-LABEL: s_trunc_i64_mul_to_i32:
320; SI:       ; %bb.0: ; %entry
321; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
322; SI-NEXT:    s_waitcnt lgkmcnt(0)
323; SI-NEXT:    s_load_dword s3, s[4:5], 0xd
324; SI-NEXT:    s_mov_b32 s7, 0xf000
325; SI-NEXT:    s_mov_b32 s6, -1
326; SI-NEXT:    s_mov_b32 s4, s0
327; SI-NEXT:    s_waitcnt lgkmcnt(0)
328; SI-NEXT:    s_mul_i32 s0, s3, s2
329; SI-NEXT:    s_mov_b32 s5, s1
330; SI-NEXT:    v_mov_b32_e32 v0, s0
331; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
332; SI-NEXT:    s_endpgm
333;
334; VI-LABEL: s_trunc_i64_mul_to_i32:
335; VI:       ; %bb.0: ; %entry
336; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
337; VI-NEXT:    s_waitcnt lgkmcnt(0)
338; VI-NEXT:    s_load_dword s3, s[4:5], 0x34
339; VI-NEXT:    s_mov_b32 s7, 0xf000
340; VI-NEXT:    s_mov_b32 s6, -1
341; VI-NEXT:    s_mov_b32 s4, s0
342; VI-NEXT:    s_waitcnt lgkmcnt(0)
343; VI-NEXT:    s_mul_i32 s0, s3, s2
344; VI-NEXT:    s_mov_b32 s5, s1
345; VI-NEXT:    v_mov_b32_e32 v0, s0
346; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
347; VI-NEXT:    s_endpgm
348;
349; GFX9-LABEL: s_trunc_i64_mul_to_i32:
350; GFX9:       ; %bb.0: ; %entry
351; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
352; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x34
354; GFX9-NEXT:    ; kill: killed $sgpr4_sgpr5
355; GFX9-NEXT:    s_mov_b32 s7, 0xf000
356; GFX9-NEXT:    s_mov_b32 s6, -1
357; GFX9-NEXT:    s_mov_b32 s4, s0
358; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX9-NEXT:    s_mul_i32 s0, s3, s2
360; GFX9-NEXT:    s_mov_b32 s5, s1
361; GFX9-NEXT:    v_mov_b32_e32 v0, s0
362; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
363; GFX9-NEXT:    s_endpgm
364;
365; GFX10-LABEL: s_trunc_i64_mul_to_i32:
366; GFX10:       ; %bb.0: ; %entry
367; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
368; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x34
370; GFX10-NEXT:    ; kill: killed $sgpr4_sgpr5
371; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
372; GFX10-NEXT:    s_mul_i32 s2, s3, s2
373; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
374; GFX10-NEXT:    v_mov_b32_e32 v0, s2
375; GFX10-NEXT:    s_mov_b32 s2, -1
376; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
377; GFX10-NEXT:    s_endpgm
378;
379; GFX11-LABEL: s_trunc_i64_mul_to_i32:
380; GFX11:       ; %bb.0: ; %entry
381; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
382; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x34
384; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX11-NEXT:    s_mul_i32 s2, s3, s2
386; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
387; GFX11-NEXT:    v_mov_b32_e32 v0, s2
388; GFX11-NEXT:    s_mov_b32 s2, -1
389; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
390; GFX11-NEXT:    s_endpgm
391;
392; GFX12-LABEL: s_trunc_i64_mul_to_i32:
393; GFX12:       ; %bb.0: ; %entry
394; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
395; GFX12-NEXT:    s_wait_kmcnt 0x0
396; GFX12-NEXT:    s_load_b32 s3, s[4:5], 0x34
397; GFX12-NEXT:    s_wait_kmcnt 0x0
398; GFX12-NEXT:    s_mul_i32 s2, s3, s2
399; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
400; GFX12-NEXT:    v_mov_b32_e32 v0, s2
401; GFX12-NEXT:    s_mov_b32 s2, -1
402; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
403; GFX12-NEXT:    s_endpgm
404;
405; EG-LABEL: s_trunc_i64_mul_to_i32:
406; EG:       ; %bb.0: ; %entry
407; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
408; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
409; EG-NEXT:    CF_END
410; EG-NEXT:    PAD
411; EG-NEXT:    ALU clause starting at 4:
412; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
413; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
414; EG-NEXT:     MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
415entry:
  ; 64-bit multiply of the two i64 kernel arguments whose result is
  ; immediately truncated to i32 before being stored to %out.
  ; Because only the low 32 bits of the product are used, the assertions
  ; above expect a single 32-bit multiply (s_mul_i32 on the amdgcn targets,
  ; MULLO_INT on r600) rather than a full 64-bit multiply sequence.
416  %mul = mul i64 %b, %a
417  %trunc = trunc i64 %mul to i32
418  store i32 %trunc, ptr addrspace(1) %out, align 8
419  ret void
420}
421
422define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
423; SI-LABEL: v_trunc_i64_mul_to_i32:
424; SI:       ; %bb.0: ; %entry
425; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
426; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
427; SI-NEXT:    s_mov_b32 s7, 0xf000
428; SI-NEXT:    s_mov_b32 s6, -1
429; SI-NEXT:    s_mov_b32 s14, s6
430; SI-NEXT:    s_waitcnt lgkmcnt(0)
431; SI-NEXT:    s_mov_b32 s12, s2
432; SI-NEXT:    s_mov_b32 s13, s3
433; SI-NEXT:    s_mov_b32 s15, s7
434; SI-NEXT:    s_mov_b32 s10, s6
435; SI-NEXT:    s_mov_b32 s11, s7
436; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
437; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
438; SI-NEXT:    s_mov_b32 s4, s0
439; SI-NEXT:    s_mov_b32 s5, s1
440; SI-NEXT:    s_waitcnt vmcnt(0)
441; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
442; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
443; SI-NEXT:    s_endpgm
444;
445; VI-LABEL: v_trunc_i64_mul_to_i32:
446; VI:       ; %bb.0: ; %entry
447; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
448; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
449; VI-NEXT:    s_mov_b32 s7, 0xf000
450; VI-NEXT:    s_mov_b32 s6, -1
451; VI-NEXT:    s_mov_b32 s14, s6
452; VI-NEXT:    s_waitcnt lgkmcnt(0)
453; VI-NEXT:    s_mov_b32 s12, s2
454; VI-NEXT:    s_mov_b32 s13, s3
455; VI-NEXT:    s_mov_b32 s15, s7
456; VI-NEXT:    s_mov_b32 s10, s6
457; VI-NEXT:    s_mov_b32 s11, s7
458; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
459; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
460; VI-NEXT:    s_mov_b32 s4, s0
461; VI-NEXT:    s_mov_b32 s5, s1
462; VI-NEXT:    s_waitcnt vmcnt(0)
463; VI-NEXT:    v_mul_lo_u32 v0, v1, v0
464; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
465; VI-NEXT:    s_endpgm
466;
467; GFX9-LABEL: v_trunc_i64_mul_to_i32:
468; GFX9:       ; %bb.0: ; %entry
469; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
470; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
471; GFX9-NEXT:    s_mov_b32 s7, 0xf000
472; GFX9-NEXT:    s_mov_b32 s6, -1
473; GFX9-NEXT:    s_mov_b32 s14, s6
474; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX9-NEXT:    s_mov_b32 s12, s2
476; GFX9-NEXT:    s_mov_b32 s13, s3
477; GFX9-NEXT:    s_mov_b32 s15, s7
478; GFX9-NEXT:    s_mov_b32 s10, s6
479; GFX9-NEXT:    s_mov_b32 s11, s7
480; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
481; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
482; GFX9-NEXT:    s_mov_b32 s4, s0
483; GFX9-NEXT:    s_mov_b32 s5, s1
484; GFX9-NEXT:    s_waitcnt vmcnt(0)
485; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
486; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
487; GFX9-NEXT:    s_endpgm
488;
489; GFX10-LABEL: v_trunc_i64_mul_to_i32:
490; GFX10:       ; %bb.0: ; %entry
491; GFX10-NEXT:    s_clause 0x1
492; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
493; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
494; GFX10-NEXT:    s_mov_b32 s6, -1
495; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
496; GFX10-NEXT:    s_mov_b32 s14, s6
497; GFX10-NEXT:    s_mov_b32 s15, s7
498; GFX10-NEXT:    s_mov_b32 s10, s6
499; GFX10-NEXT:    s_mov_b32 s11, s7
500; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX10-NEXT:    s_mov_b32 s12, s2
502; GFX10-NEXT:    s_mov_b32 s13, s3
503; GFX10-NEXT:    buffer_load_dword v0, off, s[12:15], 0
504; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
505; GFX10-NEXT:    s_mov_b32 s4, s0
506; GFX10-NEXT:    s_mov_b32 s5, s1
507; GFX10-NEXT:    s_waitcnt vmcnt(0)
508; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
509; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
510; GFX10-NEXT:    s_endpgm
511;
512; GFX11-LABEL: v_trunc_i64_mul_to_i32:
513; GFX11:       ; %bb.0: ; %entry
514; GFX11-NEXT:    s_clause 0x1
515; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
516; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
517; GFX11-NEXT:    s_mov_b32 s10, -1
518; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
519; GFX11-NEXT:    s_mov_b32 s14, s10
520; GFX11-NEXT:    s_mov_b32 s15, s11
521; GFX11-NEXT:    s_mov_b32 s6, s10
522; GFX11-NEXT:    s_mov_b32 s7, s11
523; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX11-NEXT:    s_mov_b32 s12, s2
525; GFX11-NEXT:    s_mov_b32 s13, s3
526; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
527; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0
528; GFX11-NEXT:    s_mov_b32 s8, s0
529; GFX11-NEXT:    s_mov_b32 s9, s1
530; GFX11-NEXT:    s_waitcnt vmcnt(0)
531; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v0
532; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
533; GFX11-NEXT:    s_endpgm
534;
535; GFX12-LABEL: v_trunc_i64_mul_to_i32:
536; GFX12:       ; %bb.0: ; %entry
537; GFX12-NEXT:    s_clause 0x1
538; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
539; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
540; GFX12-NEXT:    s_mov_b32 s10, -1
541; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
542; GFX12-NEXT:    s_mov_b32 s14, s10
543; GFX12-NEXT:    s_mov_b32 s15, s11
544; GFX12-NEXT:    s_mov_b32 s6, s10
545; GFX12-NEXT:    s_mov_b32 s7, s11
546; GFX12-NEXT:    s_wait_kmcnt 0x0
547; GFX12-NEXT:    s_mov_b32 s12, s2
548; GFX12-NEXT:    s_mov_b32 s13, s3
549; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null
550; GFX12-NEXT:    buffer_load_b32 v1, off, s[4:7], null
551; GFX12-NEXT:    s_mov_b32 s8, s0
552; GFX12-NEXT:    s_mov_b32 s9, s1
553; GFX12-NEXT:    s_wait_loadcnt 0x0
554; GFX12-NEXT:    v_mul_lo_u32 v0, v1, v0
555; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
556; GFX12-NEXT:    s_endpgm
557;
558; EG-LABEL: v_trunc_i64_mul_to_i32:
559; EG:       ; %bb.0: ; %entry
560; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
561; EG-NEXT:    TEX 1 @6
562; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
563; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
564; EG-NEXT:    CF_END
565; EG-NEXT:    PAD
566; EG-NEXT:    Fetch clause starting at 6:
567; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
568; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
569; EG-NEXT:    ALU clause starting at 10:
570; EG-NEXT:     MOV T0.X, KC0[2].Z,
571; EG-NEXT:     MOV * T1.X, KC0[2].W,
572; EG-NEXT:    ALU clause starting at 12:
573; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
574; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
575; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
576entry:
  ; 64-bit multiply of two i64 values loaded from %aptr and %bptr, with the
  ; product truncated to i32 before being stored to %out.
  ; Because only the low 32 bits of the product are used, the assertions
  ; above expect just a 32-bit load of each operand (buffer_load_dword /
  ; buffer_load_b32 / VTX_READ_32) feeding a single 32-bit low multiply
  ; (v_mul_lo_u32 on the amdgcn targets, MULLO_INT on r600).
577  %a = load i64, ptr addrspace(1) %aptr, align 8
578  %b = load i64, ptr addrspace(1) %bptr, align 8
579  %mul = mul i64 %b, %a
580  %trunc = trunc i64 %mul to i32
581  store i32 %trunc, ptr addrspace(1) %out, align 8
582  ret void
583}
584
585; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
586; 32 bits of both arguments are sign bits.
587
588define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
589; SI-LABEL: mul64_sext_c:
590; SI:       ; %bb.0: ; %entry
591; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
592; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
593; SI-NEXT:    v_mov_b32_e32 v0, 0x50
594; SI-NEXT:    s_mov_b32 s3, 0xf000
595; SI-NEXT:    s_mov_b32 s2, -1
596; SI-NEXT:    s_waitcnt lgkmcnt(0)
597; SI-NEXT:    v_mul_hi_i32 v1, s6, v0
598; SI-NEXT:    s_mulk_i32 s6, 0x50
599; SI-NEXT:    v_mov_b32_e32 v0, s6
600; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
601; SI-NEXT:    s_endpgm
602;
603; VI-LABEL: mul64_sext_c:
604; VI:       ; %bb.0: ; %entry
605; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
606; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
607; VI-NEXT:    v_mov_b32_e32 v0, 0x50
608; VI-NEXT:    s_waitcnt lgkmcnt(0)
609; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
610; VI-NEXT:    s_mov_b32 s3, 0xf000
611; VI-NEXT:    s_mov_b32 s2, -1
612; VI-NEXT:    s_nop 2
613; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
614; VI-NEXT:    s_endpgm
615;
616; GFX9-LABEL: mul64_sext_c:
617; GFX9:       ; %bb.0: ; %entry
618; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
619; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
620; GFX9-NEXT:    s_mov_b32 s3, 0xf000
621; GFX9-NEXT:    s_mov_b32 s2, -1
622; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX9-NEXT:    s_mul_hi_i32 s4, s6, 0x50
624; GFX9-NEXT:    s_mulk_i32 s6, 0x50
625; GFX9-NEXT:    v_mov_b32_e32 v0, s6
626; GFX9-NEXT:    v_mov_b32_e32 v1, s4
627; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
628; GFX9-NEXT:    s_endpgm
629;
630; GFX10-LABEL: mul64_sext_c:
631; GFX10:       ; %bb.0: ; %entry
632; GFX10-NEXT:    s_clause 0x1
633; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
634; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
635; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX10-NEXT:    s_mul_i32 s3, s2, 0x50
637; GFX10-NEXT:    s_mul_hi_i32 s2, s2, 0x50
638; GFX10-NEXT:    v_mov_b32_e32 v0, s3
639; GFX10-NEXT:    v_mov_b32_e32 v1, s2
640; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
641; GFX10-NEXT:    s_mov_b32 s2, -1
642; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
643; GFX10-NEXT:    s_endpgm
644;
645; GFX11-LABEL: mul64_sext_c:
646; GFX11:       ; %bb.0: ; %entry
647; GFX11-NEXT:    s_clause 0x1
648; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
649; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
650; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
651; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
652; GFX11-NEXT:    s_mul_hi_i32 s2, s2, 0x50
653; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
654; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
655; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
656; GFX11-NEXT:    s_mov_b32 s2, -1
657; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
658; GFX11-NEXT:    s_endpgm
659;
660; GFX12-LABEL: mul64_sext_c:
661; GFX12:       ; %bb.0: ; %entry
662; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
663; GFX12-NEXT:    s_wait_kmcnt 0x0
664; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
665; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
666; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
667; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
668; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
669; GFX12-NEXT:    s_mov_b32 s2, -1
670; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
671; GFX12-NEXT:    s_endpgm
672;
673; EG-LABEL: mul64_sext_c:
674; EG:       ; %bb.0: ; %entry
675; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
676; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
677; EG-NEXT:    CF_END
678; EG-NEXT:    PAD
679; EG-NEXT:    ALU clause starting at 4:
680; EG-NEXT:     MULHI_INT * T0.Y, KC0[2].Z, literal.x,
681; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
682; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
683; EG-NEXT:     MULLO_INT * T0.X, KC0[2].Z, literal.y,
684; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
685entry:
  ; Sign-extend the i32 kernel argument to i64, multiply it by the constant
  ; 80 (0x50 in the assertions above), and store the 64-bit product to %out.
  ; The expected lowering in the assertions differs per target: a signed
  ; high multiply paired with a low multiply (v_mul_hi_i32 or s_mul_hi_i32
  ; with s_mulk_i32 / s_mul_i32; MULHI_INT with MULLO_INT on r600), a single
  ; v_mad_i64_i32, or s_ashr_i32 to form the sign word followed by s_mul_u64.
686  %0 = sext i32 %in to i64
687  %1 = mul i64 %0, 80
688  store i64 %1, ptr addrspace(1) %out
689  ret void
690}
691
692define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
693; SI-LABEL: mul64_zext_c:
694; SI:       ; %bb.0: ; %entry
695; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
696; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
697; SI-NEXT:    v_mov_b32_e32 v0, 0x50
698; SI-NEXT:    s_mov_b32 s3, 0xf000
699; SI-NEXT:    s_mov_b32 s2, -1
700; SI-NEXT:    s_waitcnt lgkmcnt(0)
701; SI-NEXT:    v_mul_hi_u32 v1, s6, v0
702; SI-NEXT:    s_mulk_i32 s6, 0x50
703; SI-NEXT:    v_mov_b32_e32 v0, s6
704; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
705; SI-NEXT:    s_endpgm
706;
707; VI-LABEL: mul64_zext_c:
708; VI:       ; %bb.0: ; %entry
709; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
710; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
711; VI-NEXT:    v_mov_b32_e32 v0, 0x50
712; VI-NEXT:    s_waitcnt lgkmcnt(0)
713; VI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0
714; VI-NEXT:    s_mov_b32 s3, 0xf000
715; VI-NEXT:    s_mov_b32 s2, -1
716; VI-NEXT:    s_nop 2
717; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
718; VI-NEXT:    s_endpgm
719;
720; GFX9-LABEL: mul64_zext_c:
721; GFX9:       ; %bb.0: ; %entry
722; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
723; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
724; GFX9-NEXT:    s_mov_b32 s3, 0xf000
725; GFX9-NEXT:    s_mov_b32 s2, -1
726; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX9-NEXT:    s_mul_hi_u32 s4, s6, 0x50
728; GFX9-NEXT:    s_mulk_i32 s6, 0x50
729; GFX9-NEXT:    v_mov_b32_e32 v0, s6
730; GFX9-NEXT:    v_mov_b32_e32 v1, s4
731; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
732; GFX9-NEXT:    s_endpgm
733;
734; GFX10-LABEL: mul64_zext_c:
735; GFX10:       ; %bb.0: ; %entry
736; GFX10-NEXT:    s_clause 0x1
737; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
738; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
739; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
740; GFX10-NEXT:    s_mul_i32 s3, s2, 0x50
741; GFX10-NEXT:    s_mul_hi_u32 s2, s2, 0x50
742; GFX10-NEXT:    v_mov_b32_e32 v0, s3
743; GFX10-NEXT:    v_mov_b32_e32 v1, s2
744; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
745; GFX10-NEXT:    s_mov_b32 s2, -1
746; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
747; GFX10-NEXT:    s_endpgm
748;
749; GFX11-LABEL: mul64_zext_c:
750; GFX11:       ; %bb.0: ; %entry
751; GFX11-NEXT:    s_clause 0x1
752; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
753; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
754; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
756; GFX11-NEXT:    s_mul_hi_u32 s2, s2, 0x50
757; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
758; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
759; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
760; GFX11-NEXT:    s_mov_b32 s2, -1
761; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
762; GFX11-NEXT:    s_endpgm
763;
764; GFX12-LABEL: mul64_zext_c:
765; GFX12:       ; %bb.0: ; %entry
766; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
767; GFX12-NEXT:    s_mov_b32 s3, 0
768; GFX12-NEXT:    s_wait_kmcnt 0x0
769; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
770; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
771; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
772; GFX12-NEXT:    s_mov_b32 s2, -1
773; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
774; GFX12-NEXT:    s_endpgm
775;
776; EG-LABEL: mul64_zext_c:
777; EG:       ; %bb.0: ; %entry
778; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
779; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
780; EG-NEXT:    CF_END
781; EG-NEXT:    PAD
782; EG-NEXT:    ALU clause starting at 4:
783; EG-NEXT:     MULHI * T0.Y, KC0[2].Z, literal.x,
784; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
785; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
786; EG-NEXT:     MULLO_INT * T0.X, KC0[2].Z, literal.y,
787; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
788entry:
  ; Zero-extend the i32 kernel argument to i64, multiply it by the constant
  ; 80 (0x50 in the assertions above), and store the 64-bit product to %out.
  ; The expected lowering in the assertions differs per target: an unsigned
  ; high multiply paired with a low multiply (v_mul_hi_u32 or s_mul_hi_u32
  ; with s_mulk_i32 / s_mul_i32; MULHI with MULLO_INT on r600), a single
  ; v_mad_u64_u32, or a zeroed high word (s_mov_b32 s3, 0) followed by
  ; s_mul_u64.
789  %0 = zext i32 %in to i64
790  %1 = mul i64 %0, 80
791  store i64 %1, ptr addrspace(1) %out
792  ret void
793}
794
; v_mul64_sext_c loads an i32 from %in, sign-extends it to i64, multiplies it
; by the constant 80 (0x50) and stores the 64-bit product to %out.
; Per the assertions below, the GCN targets are expected to emit a signed
; high-half multiply (v_mul_hi_i32) together with v_mul_lo_u32, except the
; tonga (VI prefix) run which uses a single v_mad_i64_i32; the r600 run is
; expected to emit MULHI_INT together with MULLO_INT.
795define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
796; SI-LABEL: v_mul64_sext_c:
797; SI:       ; %bb.0: ; %entry
798; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
799; SI-NEXT:    s_mov_b32 s7, 0xf000
800; SI-NEXT:    s_mov_b32 s6, -1
801; SI-NEXT:    s_mov_b32 s10, s6
802; SI-NEXT:    s_mov_b32 s11, s7
803; SI-NEXT:    s_waitcnt lgkmcnt(0)
804; SI-NEXT:    s_mov_b32 s8, s2
805; SI-NEXT:    s_mov_b32 s9, s3
806; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
807; SI-NEXT:    s_movk_i32 s2, 0x50
808; SI-NEXT:    s_mov_b32 s4, s0
809; SI-NEXT:    s_mov_b32 s5, s1
810; SI-NEXT:    s_waitcnt vmcnt(0)
811; SI-NEXT:    v_mul_hi_i32 v1, v0, s2
812; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
813; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
814; SI-NEXT:    s_endpgm
815;
816; VI-LABEL: v_mul64_sext_c:
817; VI:       ; %bb.0: ; %entry
818; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
819; VI-NEXT:    s_mov_b32 s7, 0xf000
820; VI-NEXT:    s_mov_b32 s6, -1
821; VI-NEXT:    s_mov_b32 s10, s6
822; VI-NEXT:    s_mov_b32 s11, s7
823; VI-NEXT:    s_waitcnt lgkmcnt(0)
824; VI-NEXT:    s_mov_b32 s8, s2
825; VI-NEXT:    s_mov_b32 s9, s3
826; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
827; VI-NEXT:    s_movk_i32 s2, 0x50
828; VI-NEXT:    s_mov_b32 s4, s0
829; VI-NEXT:    s_mov_b32 s5, s1
830; VI-NEXT:    s_waitcnt vmcnt(0)
831; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
832; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
833; VI-NEXT:    s_endpgm
834;
835; GFX9-LABEL: v_mul64_sext_c:
836; GFX9:       ; %bb.0: ; %entry
837; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
838; GFX9-NEXT:    s_mov_b32 s7, 0xf000
839; GFX9-NEXT:    s_mov_b32 s6, -1
840; GFX9-NEXT:    s_mov_b32 s10, s6
841; GFX9-NEXT:    s_mov_b32 s11, s7
842; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX9-NEXT:    s_mov_b32 s8, s2
844; GFX9-NEXT:    s_mov_b32 s9, s3
845; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
846; GFX9-NEXT:    s_movk_i32 s2, 0x50
847; GFX9-NEXT:    s_mov_b32 s4, s0
848; GFX9-NEXT:    s_mov_b32 s5, s1
849; GFX9-NEXT:    s_waitcnt vmcnt(0)
850; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
851; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
852; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
853; GFX9-NEXT:    s_endpgm
854;
855; GFX10-LABEL: v_mul64_sext_c:
856; GFX10:       ; %bb.0: ; %entry
857; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
858; GFX10-NEXT:    s_mov_b32 s6, -1
859; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
860; GFX10-NEXT:    s_mov_b32 s10, s6
861; GFX10-NEXT:    s_mov_b32 s11, s7
862; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
863; GFX10-NEXT:    s_mov_b32 s8, s2
864; GFX10-NEXT:    s_mov_b32 s9, s3
865; GFX10-NEXT:    s_mov_b32 s4, s0
866; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
867; GFX10-NEXT:    s_mov_b32 s5, s1
868; GFX10-NEXT:    s_waitcnt vmcnt(0)
869; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
870; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
871; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
872; GFX10-NEXT:    s_endpgm
873;
874; GFX11-LABEL: v_mul64_sext_c:
875; GFX11:       ; %bb.0: ; %entry
876; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
877; GFX11-NEXT:    s_mov_b32 s6, -1
878; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
879; GFX11-NEXT:    s_mov_b32 s10, s6
880; GFX11-NEXT:    s_mov_b32 s11, s7
881; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX11-NEXT:    s_mov_b32 s8, s2
883; GFX11-NEXT:    s_mov_b32 s9, s3
884; GFX11-NEXT:    s_mov_b32 s4, s0
885; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
886; GFX11-NEXT:    s_mov_b32 s5, s1
887; GFX11-NEXT:    s_waitcnt vmcnt(0)
888; GFX11-NEXT:    v_mul_hi_i32 v1, 0x50, v0
889; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
890; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
891; GFX11-NEXT:    s_endpgm
892;
893; GFX12-LABEL: v_mul64_sext_c:
894; GFX12:       ; %bb.0: ; %entry
895; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
896; GFX12-NEXT:    s_mov_b32 s6, -1
897; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
898; GFX12-NEXT:    s_mov_b32 s10, s6
899; GFX12-NEXT:    s_mov_b32 s11, s7
900; GFX12-NEXT:    s_wait_kmcnt 0x0
901; GFX12-NEXT:    s_mov_b32 s8, s2
902; GFX12-NEXT:    s_mov_b32 s9, s3
903; GFX12-NEXT:    s_mov_b32 s4, s0
904; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
905; GFX12-NEXT:    s_mov_b32 s5, s1
906; GFX12-NEXT:    s_wait_loadcnt 0x0
907; GFX12-NEXT:    v_mul_hi_i32 v1, 0x50, v0
908; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
909; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
910; GFX12-NEXT:    s_endpgm
911;
912; EG-LABEL: v_mul64_sext_c:
913; EG:       ; %bb.0: ; %entry
914; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
915; EG-NEXT:    TEX 0 @6
916; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
917; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
918; EG-NEXT:    CF_END
919; EG-NEXT:    PAD
920; EG-NEXT:    Fetch clause starting at 6:
921; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
922; EG-NEXT:    ALU clause starting at 8:
923; EG-NEXT:     MOV * T0.X, KC0[2].Z,
924; EG-NEXT:    ALU clause starting at 9:
925; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
926; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
927; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
928; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
929; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
930entry:
931  %val = load i32, ptr addrspace(1) %in, align 4
932  %ext = sext i32 %val to i64
933  %mul = mul i64 %ext, 80
934  store i64 %mul, ptr addrspace(1) %out, align 8
935  ret void
936}
937
; v_mul64_zext_c is the unsigned counterpart of v_mul64_sext_c. It loads an
; i32 from %in, zero-extends it to i64, multiplies it by the constant 80
; (0x50) and stores the 64-bit product to %out.
; Per the assertions below, the GCN targets are expected to emit an unsigned
; high-half multiply (v_mul_hi_u32) together with v_mul_lo_u32, except the
; tonga (VI prefix) run which uses a single v_mad_u64_u32; the r600 run is
; expected to emit MULHI together with MULLO_INT.
938define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
939; SI-LABEL: v_mul64_zext_c:
940; SI:       ; %bb.0: ; %entry
941; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
942; SI-NEXT:    s_mov_b32 s7, 0xf000
943; SI-NEXT:    s_mov_b32 s6, -1
944; SI-NEXT:    s_mov_b32 s10, s6
945; SI-NEXT:    s_mov_b32 s11, s7
946; SI-NEXT:    s_waitcnt lgkmcnt(0)
947; SI-NEXT:    s_mov_b32 s8, s2
948; SI-NEXT:    s_mov_b32 s9, s3
949; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
950; SI-NEXT:    s_movk_i32 s2, 0x50
951; SI-NEXT:    s_mov_b32 s4, s0
952; SI-NEXT:    s_mov_b32 s5, s1
953; SI-NEXT:    s_waitcnt vmcnt(0)
954; SI-NEXT:    v_mul_hi_u32 v1, v0, s2
955; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
956; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
957; SI-NEXT:    s_endpgm
958;
959; VI-LABEL: v_mul64_zext_c:
960; VI:       ; %bb.0: ; %entry
961; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
962; VI-NEXT:    s_mov_b32 s7, 0xf000
963; VI-NEXT:    s_mov_b32 s6, -1
964; VI-NEXT:    s_mov_b32 s10, s6
965; VI-NEXT:    s_mov_b32 s11, s7
966; VI-NEXT:    s_waitcnt lgkmcnt(0)
967; VI-NEXT:    s_mov_b32 s8, s2
968; VI-NEXT:    s_mov_b32 s9, s3
969; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
970; VI-NEXT:    s_movk_i32 s2, 0x50
971; VI-NEXT:    s_mov_b32 s4, s0
972; VI-NEXT:    s_mov_b32 s5, s1
973; VI-NEXT:    s_waitcnt vmcnt(0)
974; VI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
975; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
976; VI-NEXT:    s_endpgm
977;
978; GFX9-LABEL: v_mul64_zext_c:
979; GFX9:       ; %bb.0: ; %entry
980; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
981; GFX9-NEXT:    s_mov_b32 s7, 0xf000
982; GFX9-NEXT:    s_mov_b32 s6, -1
983; GFX9-NEXT:    s_mov_b32 s10, s6
984; GFX9-NEXT:    s_mov_b32 s11, s7
985; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
986; GFX9-NEXT:    s_mov_b32 s8, s2
987; GFX9-NEXT:    s_mov_b32 s9, s3
988; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
989; GFX9-NEXT:    s_movk_i32 s2, 0x50
990; GFX9-NEXT:    s_mov_b32 s4, s0
991; GFX9-NEXT:    s_mov_b32 s5, s1
992; GFX9-NEXT:    s_waitcnt vmcnt(0)
993; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s2
994; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
995; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
996; GFX9-NEXT:    s_endpgm
997;
998; GFX10-LABEL: v_mul64_zext_c:
999; GFX10:       ; %bb.0: ; %entry
1000; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1001; GFX10-NEXT:    s_mov_b32 s6, -1
1002; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1003; GFX10-NEXT:    s_mov_b32 s10, s6
1004; GFX10-NEXT:    s_mov_b32 s11, s7
1005; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX10-NEXT:    s_mov_b32 s8, s2
1007; GFX10-NEXT:    s_mov_b32 s9, s3
1008; GFX10-NEXT:    s_mov_b32 s4, s0
1009; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1010; GFX10-NEXT:    s_mov_b32 s5, s1
1011; GFX10-NEXT:    s_waitcnt vmcnt(0)
1012; GFX10-NEXT:    v_mul_hi_u32 v1, 0x50, v0
1013; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
1014; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1015; GFX10-NEXT:    s_endpgm
1016;
1017; GFX11-LABEL: v_mul64_zext_c:
1018; GFX11:       ; %bb.0: ; %entry
1019; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1020; GFX11-NEXT:    s_mov_b32 s6, -1
1021; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1022; GFX11-NEXT:    s_mov_b32 s10, s6
1023; GFX11-NEXT:    s_mov_b32 s11, s7
1024; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX11-NEXT:    s_mov_b32 s8, s2
1026; GFX11-NEXT:    s_mov_b32 s9, s3
1027; GFX11-NEXT:    s_mov_b32 s4, s0
1028; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
1029; GFX11-NEXT:    s_mov_b32 s5, s1
1030; GFX11-NEXT:    s_waitcnt vmcnt(0)
1031; GFX11-NEXT:    v_mul_hi_u32 v1, 0x50, v0
1032; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
1033; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1034; GFX11-NEXT:    s_endpgm
1035;
1036; GFX12-LABEL: v_mul64_zext_c:
1037; GFX12:       ; %bb.0: ; %entry
1038; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1039; GFX12-NEXT:    s_mov_b32 s6, -1
1040; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1041; GFX12-NEXT:    s_mov_b32 s10, s6
1042; GFX12-NEXT:    s_mov_b32 s11, s7
1043; GFX12-NEXT:    s_wait_kmcnt 0x0
1044; GFX12-NEXT:    s_mov_b32 s8, s2
1045; GFX12-NEXT:    s_mov_b32 s9, s3
1046; GFX12-NEXT:    s_mov_b32 s4, s0
1047; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
1048; GFX12-NEXT:    s_mov_b32 s5, s1
1049; GFX12-NEXT:    s_wait_loadcnt 0x0
1050; GFX12-NEXT:    v_mul_hi_u32 v1, 0x50, v0
1051; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
1052; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
1053; GFX12-NEXT:    s_endpgm
1054;
1055; EG-LABEL: v_mul64_zext_c:
1056; EG:       ; %bb.0: ; %entry
1057; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1058; EG-NEXT:    TEX 0 @6
1059; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1060; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1061; EG-NEXT:    CF_END
1062; EG-NEXT:    PAD
1063; EG-NEXT:    Fetch clause starting at 6:
1064; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1065; EG-NEXT:    ALU clause starting at 8:
1066; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1067; EG-NEXT:    ALU clause starting at 9:
1068; EG-NEXT:     MULHI * T0.Y, T0.X, literal.x,
1069; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
1070; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1071; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
1072; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
1073entry:
1074  %val = load i32, ptr addrspace(1) %in, align 4
1075  %ext = zext i32 %val to i64
1076  %mul = mul i64 %ext, 80
1077  store i64 %mul, ptr addrspace(1) %out, align 8
1078  ret void
1079}
1080
; v_mul64_sext_inline_imm loads an i32 from %in, sign-extends it to i64,
; multiplies it by the constant 9 and stores the 64-bit product to %out.
; In every GCN check block below the constant 9 appears directly as an operand
; of the multiply instruction (v_mul_hi_i32 / v_mul_lo_u32, or v_mad_i64_i32 for
; the tonga run); no separate scalar move of the constant is expected.
; The r600 run is expected to emit MULHI_INT together with MULLO_INT.
1081define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1082; SI-LABEL: v_mul64_sext_inline_imm:
1083; SI:       ; %bb.0: ; %entry
1084; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1085; SI-NEXT:    s_mov_b32 s7, 0xf000
1086; SI-NEXT:    s_mov_b32 s6, -1
1087; SI-NEXT:    s_mov_b32 s10, s6
1088; SI-NEXT:    s_mov_b32 s11, s7
1089; SI-NEXT:    s_waitcnt lgkmcnt(0)
1090; SI-NEXT:    s_mov_b32 s8, s2
1091; SI-NEXT:    s_mov_b32 s9, s3
1092; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1093; SI-NEXT:    s_mov_b32 s4, s0
1094; SI-NEXT:    s_mov_b32 s5, s1
1095; SI-NEXT:    s_waitcnt vmcnt(0)
1096; SI-NEXT:    v_mul_hi_i32 v1, v0, 9
1097; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
1098; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1099; SI-NEXT:    s_endpgm
1100;
1101; VI-LABEL: v_mul64_sext_inline_imm:
1102; VI:       ; %bb.0: ; %entry
1103; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1104; VI-NEXT:    s_mov_b32 s7, 0xf000
1105; VI-NEXT:    s_mov_b32 s6, -1
1106; VI-NEXT:    s_mov_b32 s10, s6
1107; VI-NEXT:    s_mov_b32 s11, s7
1108; VI-NEXT:    s_waitcnt lgkmcnt(0)
1109; VI-NEXT:    s_mov_b32 s8, s2
1110; VI-NEXT:    s_mov_b32 s9, s3
1111; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1112; VI-NEXT:    s_mov_b32 s4, s0
1113; VI-NEXT:    s_mov_b32 s5, s1
1114; VI-NEXT:    s_waitcnt vmcnt(0)
1115; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
1116; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1117; VI-NEXT:    s_endpgm
1118;
1119; GFX9-LABEL: v_mul64_sext_inline_imm:
1120; GFX9:       ; %bb.0: ; %entry
1121; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1122; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1123; GFX9-NEXT:    s_mov_b32 s6, -1
1124; GFX9-NEXT:    s_mov_b32 s10, s6
1125; GFX9-NEXT:    s_mov_b32 s11, s7
1126; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1127; GFX9-NEXT:    s_mov_b32 s8, s2
1128; GFX9-NEXT:    s_mov_b32 s9, s3
1129; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1130; GFX9-NEXT:    s_mov_b32 s4, s0
1131; GFX9-NEXT:    s_mov_b32 s5, s1
1132; GFX9-NEXT:    s_waitcnt vmcnt(0)
1133; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
1134; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
1135; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1136; GFX9-NEXT:    s_endpgm
1137;
1138; GFX10-LABEL: v_mul64_sext_inline_imm:
1139; GFX10:       ; %bb.0: ; %entry
1140; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1141; GFX10-NEXT:    s_mov_b32 s6, -1
1142; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1143; GFX10-NEXT:    s_mov_b32 s10, s6
1144; GFX10-NEXT:    s_mov_b32 s11, s7
1145; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1146; GFX10-NEXT:    s_mov_b32 s8, s2
1147; GFX10-NEXT:    s_mov_b32 s9, s3
1148; GFX10-NEXT:    s_mov_b32 s4, s0
1149; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1150; GFX10-NEXT:    s_mov_b32 s5, s1
1151; GFX10-NEXT:    s_waitcnt vmcnt(0)
1152; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
1153; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
1154; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1155; GFX10-NEXT:    s_endpgm
1156;
1157; GFX11-LABEL: v_mul64_sext_inline_imm:
1158; GFX11:       ; %bb.0: ; %entry
1159; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1160; GFX11-NEXT:    s_mov_b32 s6, -1
1161; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1162; GFX11-NEXT:    s_mov_b32 s10, s6
1163; GFX11-NEXT:    s_mov_b32 s11, s7
1164; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX11-NEXT:    s_mov_b32 s8, s2
1166; GFX11-NEXT:    s_mov_b32 s9, s3
1167; GFX11-NEXT:    s_mov_b32 s4, s0
1168; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
1169; GFX11-NEXT:    s_mov_b32 s5, s1
1170; GFX11-NEXT:    s_waitcnt vmcnt(0)
1171; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
1172; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
1173; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1174; GFX11-NEXT:    s_endpgm
1175;
1176; GFX12-LABEL: v_mul64_sext_inline_imm:
1177; GFX12:       ; %bb.0: ; %entry
1178; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1179; GFX12-NEXT:    s_mov_b32 s6, -1
1180; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1181; GFX12-NEXT:    s_mov_b32 s10, s6
1182; GFX12-NEXT:    s_mov_b32 s11, s7
1183; GFX12-NEXT:    s_wait_kmcnt 0x0
1184; GFX12-NEXT:    s_mov_b32 s8, s2
1185; GFX12-NEXT:    s_mov_b32 s9, s3
1186; GFX12-NEXT:    s_mov_b32 s4, s0
1187; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
1188; GFX12-NEXT:    s_mov_b32 s5, s1
1189; GFX12-NEXT:    s_wait_loadcnt 0x0
1190; GFX12-NEXT:    v_mul_hi_i32 v1, 9, v0
1191; GFX12-NEXT:    v_mul_lo_u32 v0, 9, v0
1192; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
1193; GFX12-NEXT:    s_endpgm
1194;
1195; EG-LABEL: v_mul64_sext_inline_imm:
1196; EG:       ; %bb.0: ; %entry
1197; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1198; EG-NEXT:    TEX 0 @6
1199; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1200; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1201; EG-NEXT:    CF_END
1202; EG-NEXT:    PAD
1203; EG-NEXT:    Fetch clause starting at 6:
1204; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1205; EG-NEXT:    ALU clause starting at 8:
1206; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1207; EG-NEXT:    ALU clause starting at 9:
1208; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
1209; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1210; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1211; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
1212; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
1213entry:
1214  %val = load i32, ptr addrspace(1) %in, align 4
1215  %ext = sext i32 %val to i64
1216  %mul = mul i64 %ext, 9
1217  store i64 %mul, ptr addrspace(1) %out, align 8
1218  ret void
1219}
1220
; s_mul_i32 multiplies two i32 kernel arguments (%a and %b) and stores the
; 32-bit product to %out. Each operand is preceded in the signature by an
; unnamed [8 x i32] argument, so the checks load %a and %b with separate
; scalar loads at distinct offsets.
; Per the assertions below, every GCN target is expected to perform the
; multiply with the scalar s_mul_i32 instruction and then copy the result to
; a VGPR for the store; the r600 run is expected to use a single MULLO_INT on
; the two constant-buffer operands.
1221define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
1222; SI-LABEL: s_mul_i32:
1223; SI:       ; %bb.0: ; %entry
1224; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1225; SI-NEXT:    s_load_dword s7, s[4:5], 0x1c
1226; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1227; SI-NEXT:    s_mov_b32 s3, 0xf000
1228; SI-NEXT:    s_mov_b32 s2, -1
1229; SI-NEXT:    s_waitcnt lgkmcnt(0)
1230; SI-NEXT:    s_mul_i32 s4, s6, s7
1231; SI-NEXT:    v_mov_b32_e32 v0, s4
1232; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1233; SI-NEXT:    s_endpgm
1234;
1235; VI-LABEL: s_mul_i32:
1236; VI:       ; %bb.0: ; %entry
1237; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1238; VI-NEXT:    s_load_dword s7, s[4:5], 0x70
1239; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1240; VI-NEXT:    s_mov_b32 s3, 0xf000
1241; VI-NEXT:    s_mov_b32 s2, -1
1242; VI-NEXT:    s_waitcnt lgkmcnt(0)
1243; VI-NEXT:    s_mul_i32 s4, s6, s7
1244; VI-NEXT:    v_mov_b32_e32 v0, s4
1245; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1246; VI-NEXT:    s_endpgm
1247;
1248; GFX9-LABEL: s_mul_i32:
1249; GFX9:       ; %bb.0: ; %entry
1250; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x4c
1251; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x70
1252; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1253; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1254; GFX9-NEXT:    s_mov_b32 s2, -1
1255; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX9-NEXT:    s_mul_i32 s4, s6, s7
1257; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1258; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1259; GFX9-NEXT:    s_endpgm
1260;
1261; GFX10-LABEL: s_mul_i32:
1262; GFX10:       ; %bb.0: ; %entry
1263; GFX10-NEXT:    s_clause 0x2
1264; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x4c
1265; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x70
1266; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1267; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1268; GFX10-NEXT:    s_mul_i32 s2, s2, s3
1269; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1270; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1271; GFX10-NEXT:    s_mov_b32 s2, -1
1272; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1273; GFX10-NEXT:    s_endpgm
1274;
1275; GFX11-LABEL: s_mul_i32:
1276; GFX11:       ; %bb.0: ; %entry
1277; GFX11-NEXT:    s_clause 0x2
1278; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x4c
1279; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x70
1280; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1281; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1282; GFX11-NEXT:    s_mul_i32 s2, s2, s3
1283; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1284; GFX11-NEXT:    v_mov_b32_e32 v0, s2
1285; GFX11-NEXT:    s_mov_b32 s2, -1
1286; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1287; GFX11-NEXT:    s_endpgm
1288;
1289; GFX12-LABEL: s_mul_i32:
1290; GFX12:       ; %bb.0: ; %entry
1291; GFX12-NEXT:    s_clause 0x2
1292; GFX12-NEXT:    s_load_b32 s2, s[4:5], 0x4c
1293; GFX12-NEXT:    s_load_b32 s3, s[4:5], 0x70
1294; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1295; GFX12-NEXT:    s_wait_kmcnt 0x0
1296; GFX12-NEXT:    s_mul_i32 s2, s2, s3
1297; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
1298; GFX12-NEXT:    v_mov_b32_e32 v0, s2
1299; GFX12-NEXT:    s_mov_b32 s2, -1
1300; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
1301; GFX12-NEXT:    s_endpgm
1302;
1303; EG-LABEL: s_mul_i32:
1304; EG:       ; %bb.0: ; %entry
1305; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1306; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1307; EG-NEXT:    CF_END
1308; EG-NEXT:    PAD
1309; EG-NEXT:    ALU clause starting at 4:
1310; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1311; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1312; EG-NEXT:     MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
1313entry:
1314  %mul = mul i32 %a, %b
1315  store i32 %mul, ptr addrspace(1) %out, align 4
1316  ret void
1317}
1318
; v_mul_i32 loads two adjacent i32 values from %in (elements 0 and 1),
; multiplies them and stores the 32-bit product to %out.
; Per the assertions below, every target is expected to fetch both operands
; with one 64-bit load (buffer_load_dwordx2 / buffer_load_b64 on GCN,
; VTX_READ_64 on r600) and to multiply with v_mul_lo_u32 on GCN or MULLO_INT
; on r600.
1319define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1320; SI-LABEL: v_mul_i32:
1321; SI:       ; %bb.0: ; %entry
1322; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1323; SI-NEXT:    s_mov_b32 s7, 0xf000
1324; SI-NEXT:    s_mov_b32 s6, -1
1325; SI-NEXT:    s_mov_b32 s10, s6
1326; SI-NEXT:    s_mov_b32 s11, s7
1327; SI-NEXT:    s_waitcnt lgkmcnt(0)
1328; SI-NEXT:    s_mov_b32 s8, s2
1329; SI-NEXT:    s_mov_b32 s9, s3
1330; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1331; SI-NEXT:    s_mov_b32 s4, s0
1332; SI-NEXT:    s_mov_b32 s5, s1
1333; SI-NEXT:    s_waitcnt vmcnt(0)
1334; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
1335; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1336; SI-NEXT:    s_endpgm
1337;
1338; VI-LABEL: v_mul_i32:
1339; VI:       ; %bb.0: ; %entry
1340; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1341; VI-NEXT:    s_mov_b32 s7, 0xf000
1342; VI-NEXT:    s_mov_b32 s6, -1
1343; VI-NEXT:    s_mov_b32 s10, s6
1344; VI-NEXT:    s_mov_b32 s11, s7
1345; VI-NEXT:    s_waitcnt lgkmcnt(0)
1346; VI-NEXT:    s_mov_b32 s8, s2
1347; VI-NEXT:    s_mov_b32 s9, s3
1348; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1349; VI-NEXT:    s_mov_b32 s4, s0
1350; VI-NEXT:    s_mov_b32 s5, s1
1351; VI-NEXT:    s_waitcnt vmcnt(0)
1352; VI-NEXT:    v_mul_lo_u32 v0, v0, v1
1353; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1354; VI-NEXT:    s_endpgm
1355;
1356; GFX9-LABEL: v_mul_i32:
1357; GFX9:       ; %bb.0: ; %entry
1358; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1359; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1360; GFX9-NEXT:    s_mov_b32 s6, -1
1361; GFX9-NEXT:    s_mov_b32 s10, s6
1362; GFX9-NEXT:    s_mov_b32 s11, s7
1363; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX9-NEXT:    s_mov_b32 s8, s2
1365; GFX9-NEXT:    s_mov_b32 s9, s3
1366; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1367; GFX9-NEXT:    s_mov_b32 s4, s0
1368; GFX9-NEXT:    s_mov_b32 s5, s1
1369; GFX9-NEXT:    s_waitcnt vmcnt(0)
1370; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
1371; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1372; GFX9-NEXT:    s_endpgm
1373;
1374; GFX10-LABEL: v_mul_i32:
1375; GFX10:       ; %bb.0: ; %entry
1376; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1377; GFX10-NEXT:    s_mov_b32 s6, -1
1378; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1379; GFX10-NEXT:    s_mov_b32 s10, s6
1380; GFX10-NEXT:    s_mov_b32 s11, s7
1381; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX10-NEXT:    s_mov_b32 s8, s2
1383; GFX10-NEXT:    s_mov_b32 s9, s3
1384; GFX10-NEXT:    s_mov_b32 s4, s0
1385; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1386; GFX10-NEXT:    s_mov_b32 s5, s1
1387; GFX10-NEXT:    s_waitcnt vmcnt(0)
1388; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
1389; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1390; GFX10-NEXT:    s_endpgm
1391;
1392; GFX11-LABEL: v_mul_i32:
1393; GFX11:       ; %bb.0: ; %entry
1394; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1395; GFX11-NEXT:    s_mov_b32 s6, -1
1396; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1397; GFX11-NEXT:    s_mov_b32 s10, s6
1398; GFX11-NEXT:    s_mov_b32 s11, s7
1399; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1400; GFX11-NEXT:    s_mov_b32 s8, s2
1401; GFX11-NEXT:    s_mov_b32 s9, s3
1402; GFX11-NEXT:    s_mov_b32 s4, s0
1403; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
1404; GFX11-NEXT:    s_mov_b32 s5, s1
1405; GFX11-NEXT:    s_waitcnt vmcnt(0)
1406; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
1407; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
1408; GFX11-NEXT:    s_endpgm
1409;
1410; GFX12-LABEL: v_mul_i32:
1411; GFX12:       ; %bb.0: ; %entry
1412; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1413; GFX12-NEXT:    s_mov_b32 s6, -1
1414; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1415; GFX12-NEXT:    s_mov_b32 s10, s6
1416; GFX12-NEXT:    s_mov_b32 s11, s7
1417; GFX12-NEXT:    s_wait_kmcnt 0x0
1418; GFX12-NEXT:    s_mov_b32 s8, s2
1419; GFX12-NEXT:    s_mov_b32 s9, s3
1420; GFX12-NEXT:    s_mov_b32 s4, s0
1421; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], null
1422; GFX12-NEXT:    s_mov_b32 s5, s1
1423; GFX12-NEXT:    s_wait_loadcnt 0x0
1424; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
1425; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
1426; GFX12-NEXT:    s_endpgm
1427;
1428; EG-LABEL: v_mul_i32:
1429; EG:       ; %bb.0: ; %entry
1430; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1431; EG-NEXT:    TEX 0 @6
1432; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1433; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1434; EG-NEXT:    CF_END
1435; EG-NEXT:    PAD
1436; EG-NEXT:    Fetch clause starting at 6:
1437; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1438; EG-NEXT:    ALU clause starting at 8:
1439; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1440; EG-NEXT:    ALU clause starting at 9:
1441; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1442; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Y,
1443; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1444entry:
1445  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1446  %a = load i32, ptr addrspace(1) %in
1447  %b = load i32, ptr addrspace(1) %b_ptr
1448  %result = mul i32 %a, %b
1449  store i32 %result, ptr addrspace(1) %out
1450  ret void
1451}
1452
; s_mul_i1 multiplies two i1 kernel arguments (%a and %b) and stores the i1
; result to %out. Each operand is preceded in the signature by an unnamed
; [8 x i32] argument.
; Per the assertions below, every GCN target is expected to multiply the
; loaded dwords with s_mul_i32, mask the product to its low bit with
; s_and_b32 ..., 1 and write it with a byte store. The r600 run is expected to
; read the operands with VTX_READ_8, multiply with MULLO_INT, mask with
; AND_INT ..., 1 and write the byte through a MEM_RAT MSKOR store.
1453define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind {
1454; SI-LABEL: s_mul_i1:
1455; SI:       ; %bb.0: ; %entry
1456; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1457; SI-NEXT:    s_load_dword s7, s[4:5], 0x1c
1458; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1459; SI-NEXT:    s_mov_b32 s3, 0xf000
1460; SI-NEXT:    s_mov_b32 s2, -1
1461; SI-NEXT:    s_waitcnt lgkmcnt(0)
1462; SI-NEXT:    s_mul_i32 s6, s6, s7
1463; SI-NEXT:    s_and_b32 s4, s6, 1
1464; SI-NEXT:    v_mov_b32_e32 v0, s4
1465; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1466; SI-NEXT:    s_endpgm
1467;
1468; VI-LABEL: s_mul_i1:
1469; VI:       ; %bb.0: ; %entry
1470; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1471; VI-NEXT:    s_load_dword s7, s[4:5], 0x70
1472; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1473; VI-NEXT:    s_mov_b32 s3, 0xf000
1474; VI-NEXT:    s_mov_b32 s2, -1
1475; VI-NEXT:    s_waitcnt lgkmcnt(0)
1476; VI-NEXT:    s_mul_i32 s6, s6, s7
1477; VI-NEXT:    s_and_b32 s4, s6, 1
1478; VI-NEXT:    v_mov_b32_e32 v0, s4
1479; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1480; VI-NEXT:    s_endpgm
1481;
1482; GFX9-LABEL: s_mul_i1:
1483; GFX9:       ; %bb.0: ; %entry
1484; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x4c
1485; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x70
1486; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1487; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1488; GFX9-NEXT:    s_mov_b32 s2, -1
1489; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX9-NEXT:    s_mul_i32 s6, s6, s7
1491; GFX9-NEXT:    s_and_b32 s4, s6, 1
1492; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1493; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1494; GFX9-NEXT:    s_endpgm
1495;
1496; GFX10-LABEL: s_mul_i1:
1497; GFX10:       ; %bb.0: ; %entry
1498; GFX10-NEXT:    s_clause 0x2
1499; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x4c
1500; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x70
1501; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1502; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX10-NEXT:    s_mul_i32 s2, s2, s3
1504; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1505; GFX10-NEXT:    s_and_b32 s2, s2, 1
1506; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1507; GFX10-NEXT:    s_mov_b32 s2, -1
1508; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1509; GFX10-NEXT:    s_endpgm
1510;
1511; GFX11-LABEL: s_mul_i1:
1512; GFX11:       ; %bb.0: ; %entry
1513; GFX11-NEXT:    s_clause 0x2
1514; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x4c
1515; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x70
1516; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1517; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1518; GFX11-NEXT:    s_mul_i32 s2, s2, s3
1519; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1520; GFX11-NEXT:    s_and_b32 s2, s2, 1
1521; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1522; GFX11-NEXT:    v_mov_b32_e32 v0, s2
1523; GFX11-NEXT:    s_mov_b32 s2, -1
1524; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
1525; GFX11-NEXT:    s_endpgm
1526;
1527; GFX12-LABEL: s_mul_i1:
1528; GFX12:       ; %bb.0: ; %entry
1529; GFX12-NEXT:    s_clause 0x2
1530; GFX12-NEXT:    s_load_b32 s2, s[4:5], 0x4c
1531; GFX12-NEXT:    s_load_b32 s3, s[4:5], 0x70
1532; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1533; GFX12-NEXT:    s_wait_kmcnt 0x0
1534; GFX12-NEXT:    s_mul_i32 s2, s2, s3
1535; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
1536; GFX12-NEXT:    s_and_b32 s2, s2, 1
1537; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1538; GFX12-NEXT:    v_mov_b32_e32 v0, s2
1539; GFX12-NEXT:    s_mov_b32 s2, -1
1540; GFX12-NEXT:    buffer_store_b8 v0, off, s[0:3], null
1541; GFX12-NEXT:    s_endpgm
1542;
1543; EG-LABEL: s_mul_i1:
1544; EG:       ; %bb.0: ; %entry
1545; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
1546; EG-NEXT:    TEX 1 @6
1547; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
1548; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1549; EG-NEXT:    CF_END
1550; EG-NEXT:    PAD
1551; EG-NEXT:    Fetch clause starting at 6:
1552; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 72, #3
1553; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 108, #3
1554; EG-NEXT:    ALU clause starting at 10:
1555; EG-NEXT:     MOV * T0.X, 0.0,
1556; EG-NEXT:    ALU clause starting at 11:
1557; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
1558; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
1559; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1560; EG-NEXT:     AND_INT T1.W, PS, 1,
1561; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1562; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1563; EG-NEXT:     LSHL T0.X, PV.W, PS,
1564; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1565; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1566; EG-NEXT:     MOV T0.Y, 0.0,
1567; EG-NEXT:     MOV * T0.Z, 0.0,
1568; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1569; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1570entry:
1571  %mul = mul i1 %a, %b
1572  store i1 %mul, ptr addrspace(1) %out, align 4
1573  ret void
1574}
1575
; i1 multiply with both operands loaded from memory: %a is read from %in and
; %b from one i32 element past %in.  The amdgcn check lines below expect two
; byte loads (offsets 0 and 4), a 32-bit multiply, a mask with 1, and a byte
; store of the result to %out.
1576define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1577; SI-LABEL: v_mul_i1:
1578; SI:       ; %bb.0: ; %entry
1579; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1580; SI-NEXT:    s_mov_b32 s7, 0xf000
1581; SI-NEXT:    s_mov_b32 s6, -1
1582; SI-NEXT:    s_mov_b32 s10, s6
1583; SI-NEXT:    s_mov_b32 s11, s7
1584; SI-NEXT:    s_waitcnt lgkmcnt(0)
1585; SI-NEXT:    s_mov_b32 s8, s2
1586; SI-NEXT:    s_mov_b32 s9, s3
1587; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1588; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1589; SI-NEXT:    s_mov_b32 s4, s0
1590; SI-NEXT:    s_mov_b32 s5, s1
1591; SI-NEXT:    s_waitcnt vmcnt(0)
1592; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
1593; SI-NEXT:    v_and_b32_e32 v0, 1, v0
1594; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1595; SI-NEXT:    s_endpgm
1596;
1597; VI-LABEL: v_mul_i1:
1598; VI:       ; %bb.0: ; %entry
1599; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1600; VI-NEXT:    s_mov_b32 s7, 0xf000
1601; VI-NEXT:    s_mov_b32 s6, -1
1602; VI-NEXT:    s_mov_b32 s10, s6
1603; VI-NEXT:    s_mov_b32 s11, s7
1604; VI-NEXT:    s_waitcnt lgkmcnt(0)
1605; VI-NEXT:    s_mov_b32 s8, s2
1606; VI-NEXT:    s_mov_b32 s9, s3
1607; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1608; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1609; VI-NEXT:    s_mov_b32 s4, s0
1610; VI-NEXT:    s_mov_b32 s5, s1
1611; VI-NEXT:    s_waitcnt vmcnt(0)
1612; VI-NEXT:    v_mul_lo_u32 v0, v0, v1
1613; VI-NEXT:    v_and_b32_e32 v0, 1, v0
1614; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1615; VI-NEXT:    s_endpgm
1616;
1617; GFX9-LABEL: v_mul_i1:
1618; GFX9:       ; %bb.0: ; %entry
1619; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1620; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1621; GFX9-NEXT:    s_mov_b32 s6, -1
1622; GFX9-NEXT:    s_mov_b32 s10, s6
1623; GFX9-NEXT:    s_mov_b32 s11, s7
1624; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1625; GFX9-NEXT:    s_mov_b32 s8, s2
1626; GFX9-NEXT:    s_mov_b32 s9, s3
1627; GFX9-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1628; GFX9-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1629; GFX9-NEXT:    s_mov_b32 s4, s0
1630; GFX9-NEXT:    s_mov_b32 s5, s1
1631; GFX9-NEXT:    s_waitcnt vmcnt(0)
1632; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
1633; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1634; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1635; GFX9-NEXT:    s_endpgm
1636;
1637; GFX10-LABEL: v_mul_i1:
1638; GFX10:       ; %bb.0: ; %entry
1639; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1640; GFX10-NEXT:    s_mov_b32 s6, -1
1641; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1642; GFX10-NEXT:    s_mov_b32 s10, s6
1643; GFX10-NEXT:    s_mov_b32 s11, s7
1644; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1645; GFX10-NEXT:    s_mov_b32 s8, s2
1646; GFX10-NEXT:    s_mov_b32 s9, s3
1647; GFX10-NEXT:    s_clause 0x1
1648; GFX10-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1649; GFX10-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1650; GFX10-NEXT:    s_mov_b32 s4, s0
1651; GFX10-NEXT:    s_mov_b32 s5, s1
1652; GFX10-NEXT:    s_waitcnt vmcnt(0)
1653; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
1654; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
1655; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1656; GFX10-NEXT:    s_endpgm
1657;
1658; GFX11-LABEL: v_mul_i1:
1659; GFX11:       ; %bb.0: ; %entry
1660; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1661; GFX11-NEXT:    s_mov_b32 s6, -1
1662; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1663; GFX11-NEXT:    s_mov_b32 s10, s6
1664; GFX11-NEXT:    s_mov_b32 s11, s7
1665; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1666; GFX11-NEXT:    s_mov_b32 s8, s2
1667; GFX11-NEXT:    s_mov_b32 s9, s3
1668; GFX11-NEXT:    s_clause 0x1
1669; GFX11-NEXT:    buffer_load_u8 v0, off, s[8:11], 0
1670; GFX11-NEXT:    buffer_load_u8 v1, off, s[8:11], 0 offset:4
1671; GFX11-NEXT:    s_mov_b32 s4, s0
1672; GFX11-NEXT:    s_mov_b32 s5, s1
1673; GFX11-NEXT:    s_waitcnt vmcnt(0)
1674; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
1675; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1676; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
1677; GFX11-NEXT:    buffer_store_b8 v0, off, s[4:7], 0
1678; GFX11-NEXT:    s_endpgm
1679;
1680; GFX12-LABEL: v_mul_i1:
1681; GFX12:       ; %bb.0: ; %entry
1682; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1683; GFX12-NEXT:    s_mov_b32 s6, -1
1684; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1685; GFX12-NEXT:    s_mov_b32 s10, s6
1686; GFX12-NEXT:    s_mov_b32 s11, s7
1687; GFX12-NEXT:    s_wait_kmcnt 0x0
1688; GFX12-NEXT:    s_mov_b32 s8, s2
1689; GFX12-NEXT:    s_mov_b32 s9, s3
1690; GFX12-NEXT:    s_clause 0x1
1691; GFX12-NEXT:    buffer_load_u8 v0, off, s[8:11], null
1692; GFX12-NEXT:    buffer_load_u8 v1, off, s[8:11], null offset:4
1693; GFX12-NEXT:    s_mov_b32 s4, s0
1694; GFX12-NEXT:    s_mov_b32 s5, s1
1695; GFX12-NEXT:    s_wait_loadcnt 0x0
1696; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
1697; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1698; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
1699; GFX12-NEXT:    buffer_store_b8 v0, off, s[4:7], null
1700; GFX12-NEXT:    s_endpgm
1701;
1702; EG-LABEL: v_mul_i1:
1703; EG:       ; %bb.0: ; %entry
1704; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1705; EG-NEXT:    TEX 1 @6
1706; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
1707; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1708; EG-NEXT:    CF_END
1709; EG-NEXT:    PAD
1710; EG-NEXT:    Fetch clause starting at 6:
1711; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 4, #1
1712; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1713; EG-NEXT:    ALU clause starting at 10:
1714; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1715; EG-NEXT:    ALU clause starting at 11:
1716; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
1717; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
1718; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1719; EG-NEXT:     AND_INT T1.W, PS, 1,
1720; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1721; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1722; EG-NEXT:     LSHL T0.X, PV.W, PS,
1723; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1724; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1725; EG-NEXT:     MOV T0.Y, 0.0,
1726; EG-NEXT:     MOV * T0.Z, 0.0,
1727; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1728; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1729entry:
  ; The second operand lives one i32 element after the first (the checks
  ; above show this as a byte offset of 4).
1730  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1731  %a = load i1, ptr addrspace(1) %in
1732  %b = load i1, ptr addrspace(1) %b_ptr
1733  %result = mul i1 %a, %b
1734  store i1 %result, ptr addrspace(1) %out
1735  ret void
1736}
1737
1738; A standard 64-bit multiply.  The expansion should be around 6 instructions.
1739; Matching the expansion by hand would require a really complicated list of
1740; FileCheck expressions, so the assertions below are autogenerated by
1741; utils/update_llc_test_checks.py (see the NOTE at the top of this file).
1742; If a correct optimization changes the expected output, regenerate the
1743; assertions with that script rather than editing them by hand.
1744
; i64 multiply where both operands (%a, %b) are kernel arguments; the product
; is stored to %out.  The GFX12 check lines expect a single s_mul_u64, while
; the other targets' check lines expect the product to be assembled from
; several narrower multiply and add instructions.
1745define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
1746; SI-LABEL: s_mul_i64:
1747; SI:       ; %bb.0: ; %entry
1748; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1749; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1750; SI-NEXT:    s_mov_b32 s7, 0xf000
1751; SI-NEXT:    s_mov_b32 s6, -1
1752; SI-NEXT:    s_waitcnt lgkmcnt(0)
1753; SI-NEXT:    s_mov_b32 s4, s0
1754; SI-NEXT:    v_mov_b32_e32 v0, s8
1755; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
1756; SI-NEXT:    s_mul_i32 s0, s2, s9
1757; SI-NEXT:    s_mov_b32 s5, s1
1758; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1759; SI-NEXT:    s_mul_i32 s0, s3, s8
1760; SI-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
1761; SI-NEXT:    s_mul_i32 s0, s2, s8
1762; SI-NEXT:    v_mov_b32_e32 v0, s0
1763; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1764; SI-NEXT:    s_endpgm
1765;
1766; VI-LABEL: s_mul_i64:
1767; VI:       ; %bb.0: ; %entry
1768; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1769; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1770; VI-NEXT:    s_mov_b32 s7, 0xf000
1771; VI-NEXT:    s_mov_b32 s6, -1
1772; VI-NEXT:    s_waitcnt lgkmcnt(0)
1773; VI-NEXT:    s_mov_b32 s4, s0
1774; VI-NEXT:    v_mov_b32_e32 v0, s8
1775; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s2, v0, 0
1776; VI-NEXT:    s_mul_i32 s0, s2, s9
1777; VI-NEXT:    s_mov_b32 s5, s1
1778; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
1779; VI-NEXT:    s_mul_i32 s0, s3, s8
1780; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v1
1781; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1782; VI-NEXT:    s_endpgm
1783;
1784; GFX9-LABEL: s_mul_i64:
1785; GFX9:       ; %bb.0: ; %entry
1786; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1787; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1788; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1789; GFX9-NEXT:    s_mov_b32 s6, -1
1790; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1791; GFX9-NEXT:    s_mov_b32 s4, s0
1792; GFX9-NEXT:    s_mov_b32 s5, s1
1793; GFX9-NEXT:    s_mul_i32 s0, s2, s9
1794; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s8
1795; GFX9-NEXT:    s_add_i32 s0, s1, s0
1796; GFX9-NEXT:    s_mul_i32 s1, s3, s8
1797; GFX9-NEXT:    s_add_i32 s0, s0, s1
1798; GFX9-NEXT:    s_mul_i32 s1, s2, s8
1799; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1800; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1801; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1802; GFX9-NEXT:    s_endpgm
1803;
1804; GFX10-LABEL: s_mul_i64:
1805; GFX10:       ; %bb.0: ; %entry
1806; GFX10-NEXT:    s_clause 0x1
1807; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1808; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1809; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX10-NEXT:    s_mul_i32 s4, s2, s7
1811; GFX10-NEXT:    s_mul_hi_u32 s5, s2, s6
1812; GFX10-NEXT:    s_mul_i32 s3, s3, s6
1813; GFX10-NEXT:    s_add_i32 s4, s5, s4
1814; GFX10-NEXT:    s_mul_i32 s2, s2, s6
1815; GFX10-NEXT:    s_add_i32 s4, s4, s3
1816; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1817; GFX10-NEXT:    v_mov_b32_e32 v1, s4
1818; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1819; GFX10-NEXT:    s_mov_b32 s6, -1
1820; GFX10-NEXT:    s_mov_b32 s4, s0
1821; GFX10-NEXT:    s_mov_b32 s5, s1
1822; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1823; GFX10-NEXT:    s_endpgm
1824;
1825; GFX11-LABEL: s_mul_i64:
1826; GFX11:       ; %bb.0: ; %entry
1827; GFX11-NEXT:    s_clause 0x1
1828; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1829; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1830; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1831; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1832; GFX11-NEXT:    s_mul_i32 s5, s2, s5
1833; GFX11-NEXT:    s_mul_hi_u32 s6, s2, s4
1834; GFX11-NEXT:    s_mul_i32 s3, s3, s4
1835; GFX11-NEXT:    s_add_i32 s5, s6, s5
1836; GFX11-NEXT:    s_mul_i32 s2, s2, s4
1837; GFX11-NEXT:    s_add_i32 s5, s5, s3
1838; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1839; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5
1840; GFX11-NEXT:    s_mov_b32 s6, -1
1841; GFX11-NEXT:    s_mov_b32 s4, s0
1842; GFX11-NEXT:    s_mov_b32 s5, s1
1843; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1844; GFX11-NEXT:    s_endpgm
1845;
1846; GFX12-LABEL: s_mul_i64:
1847; GFX12:       ; %bb.0: ; %entry
1848; GFX12-NEXT:    s_clause 0x1
1849; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1850; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1851; GFX12-NEXT:    s_wait_kmcnt 0x0
1852; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], s[4:5]
1853; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
1854; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1855; GFX12-NEXT:    s_mov_b32 s2, -1
1856; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
1857; GFX12-NEXT:    s_endpgm
1858;
1859; EG-LABEL: s_mul_i64:
1860; EG:       ; %bb.0: ; %entry
1861; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1862; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1863; EG-NEXT:    CF_END
1864; EG-NEXT:    PAD
1865; EG-NEXT:    ALU clause starting at 4:
1866; EG-NEXT:     MULHI * T0.X, KC0[2].W, KC0[3].Y,
1867; EG-NEXT:     MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z,
1868; EG-NEXT:     ADD_INT T0.W, T0.X, PS,
1869; EG-NEXT:     MULLO_INT * T0.X, KC0[3].X, KC0[3].Y,
1870; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
1871; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1872; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1873; EG-NEXT:     MULLO_INT * T0.X, KC0[2].W, KC0[3].Y,
1874entry:
1875  %mul = mul i64 %a, %b
1876  store i64 %mul, ptr addrspace(1) %out, align 8
1877  ret void
1878}
1879
; i64 multiply where both operands are loaded from memory (%a from %aptr,
; %b from %bptr); the product is stored to %out.  All check lines below
; expect the 64-bit product to be assembled from 32-bit multiplies and adds.
1880define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
1881; SI-LABEL: v_mul_i64:
1882; SI:       ; %bb.0: ; %entry
1883; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1884; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1885; SI-NEXT:    s_mov_b32 s7, 0xf000
1886; SI-NEXT:    s_mov_b32 s6, -1
1887; SI-NEXT:    s_mov_b32 s10, s6
1888; SI-NEXT:    s_mov_b32 s11, s7
1889; SI-NEXT:    s_waitcnt lgkmcnt(0)
1890; SI-NEXT:    s_mov_b32 s12, s2
1891; SI-NEXT:    s_mov_b32 s13, s3
1892; SI-NEXT:    s_mov_b32 s14, s6
1893; SI-NEXT:    s_mov_b32 s15, s7
1894; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1895; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1896; SI-NEXT:    s_mov_b32 s4, s0
1897; SI-NEXT:    s_mov_b32 s5, s1
1898; SI-NEXT:    s_waitcnt vmcnt(0)
1899; SI-NEXT:    v_mul_lo_u32 v1, v2, v1
1900; SI-NEXT:    v_mul_hi_u32 v4, v2, v0
1901; SI-NEXT:    v_mul_lo_u32 v3, v3, v0
1902; SI-NEXT:    v_mul_lo_u32 v0, v2, v0
1903; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1904; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1905; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1906; SI-NEXT:    s_endpgm
1907;
1908; VI-LABEL: v_mul_i64:
1909; VI:       ; %bb.0: ; %entry
1910; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1911; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1912; VI-NEXT:    s_mov_b32 s7, 0xf000
1913; VI-NEXT:    s_mov_b32 s6, -1
1914; VI-NEXT:    s_mov_b32 s10, s6
1915; VI-NEXT:    s_mov_b32 s11, s7
1916; VI-NEXT:    s_waitcnt lgkmcnt(0)
1917; VI-NEXT:    s_mov_b32 s12, s2
1918; VI-NEXT:    s_mov_b32 s13, s3
1919; VI-NEXT:    s_mov_b32 s14, s6
1920; VI-NEXT:    s_mov_b32 s15, s7
1921; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1922; VI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1923; VI-NEXT:    s_mov_b32 s4, s0
1924; VI-NEXT:    s_mov_b32 s5, s1
1925; VI-NEXT:    s_waitcnt vmcnt(0)
1926; VI-NEXT:    v_mul_lo_u32 v4, v2, v1
1927; VI-NEXT:    v_mad_u64_u32 v[1:2], s[2:3], v2, v0, 0
1928; VI-NEXT:    v_mul_lo_u32 v0, v3, v0
1929; VI-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
1930; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
1931; VI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
1932; VI-NEXT:    s_endpgm
1933;
1934; GFX9-LABEL: v_mul_i64:
1935; GFX9:       ; %bb.0: ; %entry
1936; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1937; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1938; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1939; GFX9-NEXT:    s_mov_b32 s6, -1
1940; GFX9-NEXT:    s_mov_b32 s10, s6
1941; GFX9-NEXT:    s_mov_b32 s11, s7
1942; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1943; GFX9-NEXT:    s_mov_b32 s12, s2
1944; GFX9-NEXT:    s_mov_b32 s13, s3
1945; GFX9-NEXT:    s_mov_b32 s14, s6
1946; GFX9-NEXT:    s_mov_b32 s15, s7
1947; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1948; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1949; GFX9-NEXT:    s_mov_b32 s4, s0
1950; GFX9-NEXT:    s_mov_b32 s5, s1
1951; GFX9-NEXT:    s_waitcnt vmcnt(0)
1952; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
1953; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v0
1954; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
1955; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
1956; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
1957; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1958; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1959; GFX9-NEXT:    s_endpgm
1960;
1961; GFX10-LABEL: v_mul_i64:
1962; GFX10:       ; %bb.0: ; %entry
1963; GFX10-NEXT:    s_clause 0x1
1964; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1965; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
1966; GFX10-NEXT:    s_mov_b32 s6, -1
1967; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1968; GFX10-NEXT:    s_mov_b32 s10, s6
1969; GFX10-NEXT:    s_mov_b32 s11, s7
1970; GFX10-NEXT:    s_mov_b32 s14, s6
1971; GFX10-NEXT:    s_mov_b32 s15, s7
1972; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1973; GFX10-NEXT:    s_mov_b32 s12, s2
1974; GFX10-NEXT:    s_mov_b32 s13, s3
1975; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1976; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1977; GFX10-NEXT:    s_mov_b32 s4, s0
1978; GFX10-NEXT:    s_mov_b32 s5, s1
1979; GFX10-NEXT:    s_waitcnt vmcnt(0)
1980; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
1981; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v0
1982; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
1983; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
1984; GFX10-NEXT:    v_add_nc_u32_e32 v1, v4, v1
1985; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
1986; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1987; GFX10-NEXT:    s_endpgm
1988;
1989; GFX11-LABEL: v_mul_i64:
1990; GFX11:       ; %bb.0: ; %entry
1991; GFX11-NEXT:    s_clause 0x1
1992; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1993; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1994; GFX11-NEXT:    s_mov_b32 s10, -1
1995; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1996; GFX11-NEXT:    s_mov_b32 s6, s10
1997; GFX11-NEXT:    s_mov_b32 s7, s11
1998; GFX11-NEXT:    s_mov_b32 s14, s10
1999; GFX11-NEXT:    s_mov_b32 s15, s11
2000; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX11-NEXT:    s_mov_b32 s12, s2
2002; GFX11-NEXT:    s_mov_b32 s13, s3
2003; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
2004; GFX11-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], 0
2005; GFX11-NEXT:    s_mov_b32 s8, s0
2006; GFX11-NEXT:    s_mov_b32 s9, s1
2007; GFX11-NEXT:    s_waitcnt vmcnt(0)
2008; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v1
2009; GFX11-NEXT:    v_mul_hi_u32 v4, v2, v0
2010; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v0
2011; GFX11-NEXT:    v_mul_lo_u32 v0, v2, v0
2012; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2013; GFX11-NEXT:    v_add_nc_u32_e32 v1, v4, v1
2014; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2015; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2016; GFX11-NEXT:    s_endpgm
2017;
2018; GFX12-LABEL: v_mul_i64:
2019; GFX12:       ; %bb.0: ; %entry
2020; GFX12-NEXT:    s_clause 0x1
2021; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2022; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
2023; GFX12-NEXT:    s_mov_b32 s10, -1
2024; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2025; GFX12-NEXT:    s_mov_b32 s6, s10
2026; GFX12-NEXT:    s_mov_b32 s7, s11
2027; GFX12-NEXT:    s_mov_b32 s14, s10
2028; GFX12-NEXT:    s_mov_b32 s15, s11
2029; GFX12-NEXT:    s_wait_kmcnt 0x0
2030; GFX12-NEXT:    s_mov_b32 s12, s2
2031; GFX12-NEXT:    s_mov_b32 s13, s3
2032; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], null
2033; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], null
2034; GFX12-NEXT:    s_mov_b32 s8, s0
2035; GFX12-NEXT:    s_mov_b32 s9, s1
2036; GFX12-NEXT:    s_wait_loadcnt 0x0
2037; GFX12-NEXT:    v_mul_lo_u32 v3, v0, v3
2038; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v2
2039; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
2040; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
2041; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2042; GFX12-NEXT:    v_add_nc_u32_e32 v1, v3, v1
2043; GFX12-NEXT:    v_add_nc_u32_e32 v1, v1, v4
2044; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2045; GFX12-NEXT:    s_endpgm
2046;
2047; EG-LABEL: v_mul_i64:
2048; EG:       ; %bb.0: ; %entry
2049; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2050; EG-NEXT:    TEX 1 @6
2051; EG-NEXT:    ALU 7, @12, KC0[CB0:0-32], KC1[]
2052; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
2053; EG-NEXT:    CF_END
2054; EG-NEXT:    PAD
2055; EG-NEXT:    Fetch clause starting at 6:
2056; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
2057; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
2058; EG-NEXT:    ALU clause starting at 10:
2059; EG-NEXT:     MOV T0.X, KC0[2].Z,
2060; EG-NEXT:     MOV * T1.X, KC0[2].W,
2061; EG-NEXT:    ALU clause starting at 12:
2062; EG-NEXT:     MULHI * T0.Z, T0.X, T1.X,
2063; EG-NEXT:     MULLO_INT * T0.W, T0.X, T1.Y,
2064; EG-NEXT:     ADD_INT T0.W, T0.Z, PS,
2065; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.X,
2066; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
2067; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2068; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
2069; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2070entry:
2071  %a = load i64, ptr addrspace(1) %aptr, align 8
2072  %b = load i64, ptr addrspace(1) %bptr, align 8
2073  %mul = mul i64 %a, %b
2074  store i64 %mul, ptr addrspace(1) %out, align 8
2075  ret void
2076}
2077
; 32-bit multiply that is only executed on one side of a branch: when %a is
; zero the value stored to %out is loaded from %in, otherwise it is %a * %b.
; The %c argument is not referenced by the body.  The block names used here
; (if / else / endif) show up in the expected assembly comments below, so
; renaming them requires regenerating the assertions.
2078define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
2079; SI-LABEL: mul32_in_branch:
2080; SI:       ; %bb.0: ; %entry
2081; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
2082; SI-NEXT:    s_waitcnt lgkmcnt(0)
2083; SI-NEXT:    s_cmp_lg_u32 s0, 0
2084; SI-NEXT:    s_cbranch_scc0 .LBB15_2
2085; SI-NEXT:  ; %bb.1: ; %else
2086; SI-NEXT:    s_mul_i32 s8, s0, s1
2087; SI-NEXT:    s_mov_b64 s[6:7], 0
2088; SI-NEXT:    s_branch .LBB15_3
2089; SI-NEXT:  .LBB15_2:
2090; SI-NEXT:    s_mov_b64 s[6:7], -1
2091; SI-NEXT:    ; implicit-def: $sgpr8
2092; SI-NEXT:  .LBB15_3: ; %Flow
2093; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2094; SI-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
2095; SI-NEXT:    s_waitcnt lgkmcnt(0)
2096; SI-NEXT:    s_mov_b64 vcc, vcc
2097; SI-NEXT:    s_cbranch_vccnz .LBB15_5
2098; SI-NEXT:  ; %bb.4: ; %if
2099; SI-NEXT:    s_mov_b32 s7, 0xf000
2100; SI-NEXT:    s_mov_b32 s6, -1
2101; SI-NEXT:    s_mov_b32 s4, s2
2102; SI-NEXT:    s_mov_b32 s5, s3
2103; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2104; SI-NEXT:    s_branch .LBB15_6
2105; SI-NEXT:  .LBB15_5:
2106; SI-NEXT:    v_mov_b32_e32 v0, s8
2107; SI-NEXT:  .LBB15_6: ; %endif
2108; SI-NEXT:    s_mov_b32 s3, 0xf000
2109; SI-NEXT:    s_mov_b32 s2, -1
2110; SI-NEXT:    s_waitcnt vmcnt(0)
2111; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2112; SI-NEXT:    s_endpgm
2113;
2114; VI-LABEL: mul32_in_branch:
2115; VI:       ; %bb.0: ; %entry
2116; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2117; VI-NEXT:    s_waitcnt lgkmcnt(0)
2118; VI-NEXT:    s_cmp_lg_u32 s0, 0
2119; VI-NEXT:    s_cbranch_scc0 .LBB15_2
2120; VI-NEXT:  ; %bb.1: ; %else
2121; VI-NEXT:    s_mul_i32 s8, s0, s1
2122; VI-NEXT:    s_mov_b64 s[6:7], 0
2123; VI-NEXT:    s_branch .LBB15_3
2124; VI-NEXT:  .LBB15_2:
2125; VI-NEXT:    s_mov_b64 s[6:7], -1
2126; VI-NEXT:    ; implicit-def: $sgpr8
2127; VI-NEXT:  .LBB15_3: ; %Flow
2128; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2129; VI-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
2130; VI-NEXT:    s_cbranch_vccnz .LBB15_5
2131; VI-NEXT:  ; %bb.4: ; %if
2132; VI-NEXT:    s_mov_b32 s7, 0xf000
2133; VI-NEXT:    s_mov_b32 s6, -1
2134; VI-NEXT:    s_waitcnt lgkmcnt(0)
2135; VI-NEXT:    s_mov_b32 s4, s2
2136; VI-NEXT:    s_mov_b32 s5, s3
2137; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2138; VI-NEXT:    s_branch .LBB15_6
2139; VI-NEXT:  .LBB15_5:
2140; VI-NEXT:    v_mov_b32_e32 v0, s8
2141; VI-NEXT:  .LBB15_6: ; %endif
2142; VI-NEXT:    s_waitcnt lgkmcnt(0)
2143; VI-NEXT:    s_mov_b32 s3, 0xf000
2144; VI-NEXT:    s_mov_b32 s2, -1
2145; VI-NEXT:    s_waitcnt vmcnt(0)
2146; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2147; VI-NEXT:    s_endpgm
2148;
2149; GFX9-LABEL: mul32_in_branch:
2150; GFX9:       ; %bb.0: ; %entry
2151; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2152; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2153; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
2154; GFX9-NEXT:    s_cbranch_scc0 .LBB15_2
2155; GFX9-NEXT:  ; %bb.1: ; %else
2156; GFX9-NEXT:    s_mul_i32 s8, s0, s1
2157; GFX9-NEXT:    s_mov_b64 s[6:7], 0
2158; GFX9-NEXT:    s_branch .LBB15_3
2159; GFX9-NEXT:  .LBB15_2:
2160; GFX9-NEXT:    s_mov_b64 s[6:7], -1
2161; GFX9-NEXT:    ; implicit-def: $sgpr8
2162; GFX9-NEXT:  .LBB15_3: ; %Flow
2163; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2164; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
2165; GFX9-NEXT:    s_cbranch_vccnz .LBB15_5
2166; GFX9-NEXT:  ; %bb.4: ; %if
2167; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2168; GFX9-NEXT:    s_mov_b32 s6, -1
2169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2170; GFX9-NEXT:    s_mov_b32 s4, s2
2171; GFX9-NEXT:    s_mov_b32 s5, s3
2172; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2173; GFX9-NEXT:    s_branch .LBB15_6
2174; GFX9-NEXT:  .LBB15_5:
2175; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2176; GFX9-NEXT:  .LBB15_6: ; %endif
2177; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2178; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2179; GFX9-NEXT:    s_mov_b32 s2, -1
2180; GFX9-NEXT:    s_waitcnt vmcnt(0)
2181; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2182; GFX9-NEXT:    s_endpgm
2183;
2184; GFX10-LABEL: mul32_in_branch:
2185; GFX10:       ; %bb.0: ; %entry
2186; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
2187; GFX10-NEXT:    s_mov_b32 s6, 0
2188; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2189; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
2190; GFX10-NEXT:    s_cbranch_scc0 .LBB15_2
2191; GFX10-NEXT:  ; %bb.1: ; %else
2192; GFX10-NEXT:    s_mul_i32 s7, s0, s1
2193; GFX10-NEXT:    s_branch .LBB15_3
2194; GFX10-NEXT:  .LBB15_2:
2195; GFX10-NEXT:    s_mov_b32 s6, -1
2196; GFX10-NEXT:    ; implicit-def: $sgpr7
2197; GFX10-NEXT:  .LBB15_3: ; %Flow
2198; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2199; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s6
2200; GFX10-NEXT:    s_cbranch_vccnz .LBB15_5
2201; GFX10-NEXT:  ; %bb.4: ; %if
2202; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
2203; GFX10-NEXT:    s_mov_b32 s6, -1
2204; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2205; GFX10-NEXT:    s_mov_b32 s4, s2
2206; GFX10-NEXT:    s_mov_b32 s5, s3
2207; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2208; GFX10-NEXT:    s_branch .LBB15_6
2209; GFX10-NEXT:  .LBB15_5:
2210; GFX10-NEXT:    v_mov_b32_e32 v0, s7
2211; GFX10-NEXT:  .LBB15_6: ; %endif
2212; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2214; GFX10-NEXT:    s_mov_b32 s2, -1
2215; GFX10-NEXT:    s_waitcnt vmcnt(0)
2216; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2217; GFX10-NEXT:    s_endpgm
2218;
2219; GFX11-LABEL: mul32_in_branch:
2220; GFX11:       ; %bb.0: ; %entry
2221; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2222; GFX11-NEXT:    s_mov_b32 s6, 0
2223; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2224; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
2225; GFX11-NEXT:    s_cbranch_scc0 .LBB15_2
2226; GFX11-NEXT:  ; %bb.1: ; %else
2227; GFX11-NEXT:    s_mul_i32 s7, s0, s1
2228; GFX11-NEXT:    s_branch .LBB15_3
2229; GFX11-NEXT:  .LBB15_2:
2230; GFX11-NEXT:    s_mov_b32 s6, -1
2231; GFX11-NEXT:    ; implicit-def: $sgpr7
2232; GFX11-NEXT:  .LBB15_3: ; %Flow
2233; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2234; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
2235; GFX11-NEXT:    s_cbranch_vccnz .LBB15_5
2236; GFX11-NEXT:  ; %bb.4: ; %if
2237; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
2238; GFX11-NEXT:    s_mov_b32 s6, -1
2239; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2240; GFX11-NEXT:    s_mov_b32 s4, s2
2241; GFX11-NEXT:    s_mov_b32 s5, s3
2242; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2243; GFX11-NEXT:    s_branch .LBB15_6
2244; GFX11-NEXT:  .LBB15_5:
2245; GFX11-NEXT:    v_mov_b32_e32 v0, s7
2246; GFX11-NEXT:  .LBB15_6: ; %endif
2247; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2248; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2249; GFX11-NEXT:    s_mov_b32 s2, -1
2250; GFX11-NEXT:    s_waitcnt vmcnt(0)
2251; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2252; GFX11-NEXT:    s_endpgm
2253;
2254; GFX12-LABEL: mul32_in_branch:
2255; GFX12:       ; %bb.0: ; %entry
2256; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
2257; GFX12-NEXT:    s_mov_b32 s6, 0
2258; GFX12-NEXT:    s_wait_kmcnt 0x0
2259; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
2260; GFX12-NEXT:    s_cbranch_scc0 .LBB15_2
2261; GFX12-NEXT:  ; %bb.1: ; %else
2262; GFX12-NEXT:    s_mul_i32 s7, s0, s1
2263; GFX12-NEXT:    s_branch .LBB15_3
2264; GFX12-NEXT:  .LBB15_2:
2265; GFX12-NEXT:    s_mov_b32 s6, -1
2266; GFX12-NEXT:    ; implicit-def: $sgpr7
2267; GFX12-NEXT:  .LBB15_3: ; %Flow
2268; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2269; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
2270; GFX12-NEXT:    s_cbranch_vccnz .LBB15_5
2271; GFX12-NEXT:  ; %bb.4: ; %if
2272; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
2273; GFX12-NEXT:    s_mov_b32 s6, -1
2274; GFX12-NEXT:    s_wait_kmcnt 0x0
2275; GFX12-NEXT:    s_mov_b32 s4, s2
2276; GFX12-NEXT:    s_mov_b32 s5, s3
2277; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2278; GFX12-NEXT:    s_branch .LBB15_6
2279; GFX12-NEXT:  .LBB15_5:
2280; GFX12-NEXT:    v_mov_b32_e32 v0, s7
2281; GFX12-NEXT:  .LBB15_6: ; %endif
2282; GFX12-NEXT:    s_wait_kmcnt 0x0
2283; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
2284; GFX12-NEXT:    s_mov_b32 s2, -1
2285; GFX12-NEXT:    s_wait_loadcnt 0x0
2286; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
2287; GFX12-NEXT:    s_endpgm
2288;
2289; EG-LABEL: mul32_in_branch:
2290; EG:       ; %bb.0: ; %entry
2291; EG-NEXT:    ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
2292; EG-NEXT:    JUMP @3 POP:1
2293; EG-NEXT:    ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[]
2294; EG-NEXT:    ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[]
2295; EG-NEXT:    JUMP @8 POP:1
2296; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
2297; EG-NEXT:    TEX 0 @12
2298; EG-NEXT:    POP @8 POP:1
2299; EG-NEXT:    ALU 1, @27, KC0[], KC1[]
2300; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2301; EG-NEXT:    CF_END
2302; EG-NEXT:    PAD
2303; EG-NEXT:    Fetch clause starting at 12:
2304; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
2305; EG-NEXT:    ALU clause starting at 14:
2306; EG-NEXT:     MOV T0.W, literal.x,
2307; EG-NEXT:     SETNE_INT * T1.W, KC0[2].W, 0.0,
2308; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
2309; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2310; EG-NEXT:    ALU clause starting at 18:
2311; EG-NEXT:     MOV T1.W, KC0[2].W,
2312; EG-NEXT:     MOV * T2.W, KC0[3].X,
2313; EG-NEXT:     MOV T0.W, literal.x,
2314; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
2315; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2316; EG-NEXT:    ALU clause starting at 23:
2317; EG-NEXT:     MOV T1.W, KC0[2].Y,
2318; EG-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
2319; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2320; EG-NEXT:    ALU clause starting at 26:
2321; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2322; EG-NEXT:    ALU clause starting at 27:
2323; EG-NEXT:     LSHR * T1.X, T1.W, literal.x,
2324; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2325entry:
  ; Branch on whether the first scalar argument is zero.
2326  %0 = icmp eq i32 %a, 0
2327  br i1 %0, label %if, label %else
2328
2329if:
  ; %a == 0: take the value from memory instead of multiplying.
2330  %1 = load i32, ptr addrspace(1) %in
2331  br label %endif
2332
2333else:
  ; %a != 0: this is the multiply under test.
2334  %2 = mul i32 %a, %b
2335  br label %endif
2336
2337endif:
  ; Merge the two paths and store whichever value was produced.
2338  %3 = phi i32 [%1, %if], [%2, %else]
2339  store i32 %3, ptr addrspace(1) %out
2340  ret void
2341}
2342
; Tests a 64-bit multiply that sits on only one side of a branch: when %a is
; zero the result is loaded from %in, otherwise it is %a * %b, and the merged
; value is stored to %out. The %c argument is not referenced by the body.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; Expected lowering of the multiply, as recorded in the checks: v_mul_hi_u32
; plus s_mul_i32 on verde, v_mad_u64_u32 plus s_mul_i32 on tonga,
; s_mul_i32 / s_mul_hi_u32 on gfx900, gfx1010 and gfx1100, a single s_mul_u64
; on gfx1200, and MULLO_INT / MULHI on redwood.
2343define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
2344; SI-LABEL: mul64_in_branch:
2345; SI:       ; %bb.0: ; %entry
2346; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
2347; SI-NEXT:    s_mov_b64 s[8:9], 0
2348; SI-NEXT:    s_waitcnt lgkmcnt(0)
2349; SI-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
2350; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
2351; SI-NEXT:    s_cbranch_vccz .LBB16_4
2352; SI-NEXT:  ; %bb.1: ; %else
2353; SI-NEXT:    v_mov_b32_e32 v0, s6
2354; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
2355; SI-NEXT:    s_mul_i32 s7, s4, s7
2356; SI-NEXT:    s_mul_i32 s5, s5, s6
2357; SI-NEXT:    s_mul_i32 s4, s4, s6
2358; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
2359; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
2360; SI-NEXT:    v_mov_b32_e32 v0, s4
2361; SI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
2362; SI-NEXT:    s_cbranch_vccnz .LBB16_3
2363; SI-NEXT:  .LBB16_2: ; %if
2364; SI-NEXT:    s_mov_b32 s7, 0xf000
2365; SI-NEXT:    s_mov_b32 s6, -1
2366; SI-NEXT:    s_mov_b32 s4, s2
2367; SI-NEXT:    s_mov_b32 s5, s3
2368; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2369; SI-NEXT:  .LBB16_3: ; %endif
2370; SI-NEXT:    s_mov_b32 s3, 0xf000
2371; SI-NEXT:    s_mov_b32 s2, -1
2372; SI-NEXT:    s_waitcnt vmcnt(0)
2373; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2374; SI-NEXT:    s_endpgm
2375; SI-NEXT:  .LBB16_4:
2376; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
2377; SI-NEXT:    s_branch .LBB16_2
2378;
2379; VI-LABEL: mul64_in_branch:
2380; VI:       ; %bb.0: ; %entry
2381; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
2382; VI-NEXT:    s_mov_b64 s[8:9], 0
2383; VI-NEXT:    s_waitcnt lgkmcnt(0)
2384; VI-NEXT:    s_cmp_lg_u64 s[4:5], 0
2385; VI-NEXT:    s_cbranch_scc0 .LBB16_4
2386; VI-NEXT:  ; %bb.1: ; %else
2387; VI-NEXT:    v_mov_b32_e32 v0, s6
2388; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
2389; VI-NEXT:    s_mul_i32 s4, s4, s7
2390; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
2391; VI-NEXT:    s_mul_i32 s4, s5, s6
2392; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
2393; VI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
2394; VI-NEXT:    s_cbranch_vccnz .LBB16_3
2395; VI-NEXT:  .LBB16_2: ; %if
2396; VI-NEXT:    s_mov_b32 s7, 0xf000
2397; VI-NEXT:    s_mov_b32 s6, -1
2398; VI-NEXT:    s_mov_b32 s4, s2
2399; VI-NEXT:    s_mov_b32 s5, s3
2400; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2401; VI-NEXT:  .LBB16_3: ; %endif
2402; VI-NEXT:    s_mov_b32 s3, 0xf000
2403; VI-NEXT:    s_mov_b32 s2, -1
2404; VI-NEXT:    s_waitcnt vmcnt(0)
2405; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2406; VI-NEXT:    s_endpgm
2407; VI-NEXT:  .LBB16_4:
2408; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
2409; VI-NEXT:    s_branch .LBB16_2
2410;
2411; GFX9-LABEL: mul64_in_branch:
2412; GFX9:       ; %bb.0: ; %entry
2413; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2414; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2415; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2416; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
2417; GFX9-NEXT:    s_cbranch_scc0 .LBB16_3
2418; GFX9-NEXT:  ; %bb.1: ; %else
2419; GFX9-NEXT:    s_mul_i32 s2, s12, s15
2420; GFX9-NEXT:    s_mul_hi_u32 s3, s12, s14
2421; GFX9-NEXT:    s_add_i32 s2, s3, s2
2422; GFX9-NEXT:    s_mul_i32 s3, s13, s14
2423; GFX9-NEXT:    s_add_i32 s3, s2, s3
2424; GFX9-NEXT:    s_mul_i32 s2, s12, s14
2425; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
2426; GFX9-NEXT:    s_cbranch_vccnz .LBB16_4
2427; GFX9-NEXT:  .LBB16_2: ; %if
2428; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2429; GFX9-NEXT:    s_mov_b32 s2, -1
2430; GFX9-NEXT:    s_mov_b32 s0, s10
2431; GFX9-NEXT:    s_mov_b32 s1, s11
2432; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2433; GFX9-NEXT:    s_branch .LBB16_5
2434; GFX9-NEXT:  .LBB16_3:
2435; GFX9-NEXT:    ; implicit-def: $sgpr2_sgpr3
2436; GFX9-NEXT:    s_branch .LBB16_2
2437; GFX9-NEXT:  .LBB16_4:
2438; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2439; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2440; GFX9-NEXT:  .LBB16_5: ; %endif
2441; GFX9-NEXT:    s_mov_b32 s11, 0xf000
2442; GFX9-NEXT:    s_mov_b32 s10, -1
2443; GFX9-NEXT:    s_waitcnt vmcnt(0)
2444; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2445; GFX9-NEXT:    s_endpgm
2446;
2447; GFX10-LABEL: mul64_in_branch:
2448; GFX10:       ; %bb.0: ; %entry
2449; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
2450; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2451; GFX10-NEXT:    s_cmp_lg_u64 s[12:13], 0
2452; GFX10-NEXT:    s_cbranch_scc0 .LBB16_3
2453; GFX10-NEXT:  ; %bb.1: ; %else
2454; GFX10-NEXT:    s_mul_i32 s0, s12, s15
2455; GFX10-NEXT:    s_mul_hi_u32 s1, s12, s14
2456; GFX10-NEXT:    s_mul_i32 s2, s13, s14
2457; GFX10-NEXT:    s_add_i32 s0, s1, s0
2458; GFX10-NEXT:    s_add_i32 s1, s0, s2
2459; GFX10-NEXT:    s_mul_i32 s0, s12, s14
2460; GFX10-NEXT:    s_cbranch_execnz .LBB16_4
2461; GFX10-NEXT:  .LBB16_2: ; %if
2462; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2463; GFX10-NEXT:    s_mov_b32 s2, -1
2464; GFX10-NEXT:    s_mov_b32 s0, s10
2465; GFX10-NEXT:    s_mov_b32 s1, s11
2466; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2467; GFX10-NEXT:    s_branch .LBB16_5
2468; GFX10-NEXT:  .LBB16_3:
2469; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
2470; GFX10-NEXT:    s_branch .LBB16_2
2471; GFX10-NEXT:  .LBB16_4:
2472; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2473; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2474; GFX10-NEXT:  .LBB16_5: ; %endif
2475; GFX10-NEXT:    s_mov_b32 s11, 0x31016000
2476; GFX10-NEXT:    s_mov_b32 s10, -1
2477; GFX10-NEXT:    s_waitcnt vmcnt(0)
2478; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2479; GFX10-NEXT:    s_endpgm
2480;
2481; GFX11-LABEL: mul64_in_branch:
2482; GFX11:       ; %bb.0: ; %entry
2483; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2484; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2485; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
2486; GFX11-NEXT:    s_cbranch_scc0 .LBB16_3
2487; GFX11-NEXT:  ; %bb.1: ; %else
2488; GFX11-NEXT:    s_mul_i32 s7, s4, s7
2489; GFX11-NEXT:    s_mul_hi_u32 s8, s4, s6
2490; GFX11-NEXT:    s_mul_i32 s5, s5, s6
2491; GFX11-NEXT:    s_add_i32 s7, s8, s7
2492; GFX11-NEXT:    s_mul_i32 s4, s4, s6
2493; GFX11-NEXT:    s_add_i32 s5, s7, s5
2494; GFX11-NEXT:    s_cbranch_execnz .LBB16_4
2495; GFX11-NEXT:  .LBB16_2: ; %if
2496; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
2497; GFX11-NEXT:    s_mov_b32 s6, -1
2498; GFX11-NEXT:    s_mov_b32 s4, s2
2499; GFX11-NEXT:    s_mov_b32 s5, s3
2500; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
2501; GFX11-NEXT:    s_branch .LBB16_5
2502; GFX11-NEXT:  .LBB16_3:
2503; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
2504; GFX11-NEXT:    s_branch .LBB16_2
2505; GFX11-NEXT:  .LBB16_4:
2506; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2507; GFX11-NEXT:  .LBB16_5: ; %endif
2508; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2509; GFX11-NEXT:    s_mov_b32 s2, -1
2510; GFX11-NEXT:    s_waitcnt vmcnt(0)
2511; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2512; GFX11-NEXT:    s_endpgm
2513;
2514; GFX12-LABEL: mul64_in_branch:
2515; GFX12:       ; %bb.0: ; %entry
2516; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
2517; GFX12-NEXT:    s_wait_kmcnt 0x0
2518; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
2519; GFX12-NEXT:    s_cbranch_scc0 .LBB16_3
2520; GFX12-NEXT:  ; %bb.1: ; %else
2521; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
2522; GFX12-NEXT:    s_cbranch_execnz .LBB16_4
2523; GFX12-NEXT:  .LBB16_2: ; %if
2524; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
2525; GFX12-NEXT:    s_mov_b32 s6, -1
2526; GFX12-NEXT:    s_mov_b32 s4, s2
2527; GFX12-NEXT:    s_mov_b32 s5, s3
2528; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], null
2529; GFX12-NEXT:    s_branch .LBB16_5
2530; GFX12-NEXT:  .LBB16_3:
2531; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
2532; GFX12-NEXT:    s_branch .LBB16_2
2533; GFX12-NEXT:  .LBB16_4:
2534; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2535; GFX12-NEXT:  .LBB16_5: ; %endif
2536; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
2537; GFX12-NEXT:    s_mov_b32 s2, -1
2538; GFX12-NEXT:    s_wait_loadcnt 0x0
2539; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
2540; GFX12-NEXT:    s_endpgm
2541;
2542; EG-LABEL: mul64_in_branch:
2543; EG:       ; %bb.0: ; %entry
2544; EG-NEXT:    ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
2545; EG-NEXT:    JUMP @3 POP:1
2546; EG-NEXT:    ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[]
2547; EG-NEXT:    ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
2548; EG-NEXT:    JUMP @8 POP:1
2549; EG-NEXT:    ALU 0, @34, KC0[CB0:0-32], KC1[]
2550; EG-NEXT:    TEX 0 @12
2551; EG-NEXT:    POP @8 POP:1
2552; EG-NEXT:    ALU 1, @35, KC0[], KC1[]
2553; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2554; EG-NEXT:    CF_END
2555; EG-NEXT:    PAD
2556; EG-NEXT:    Fetch clause starting at 12:
2557; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
2558; EG-NEXT:    ALU clause starting at 14:
2559; EG-NEXT:     OR_INT T0.W, KC0[2].W, KC0[3].X,
2560; EG-NEXT:     MOV * T1.W, literal.x,
2561; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
2562; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
2563; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
2564; EG-NEXT:    ALU clause starting at 19:
2565; EG-NEXT:     MOV T0.W, KC0[2].W,
2566; EG-NEXT:     MOV * T1.W, KC0[3].Z,
2567; EG-NEXT:     MOV T2.W, KC0[3].Y,
2568; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
2569; EG-NEXT:     MOV T1.W, KC0[3].X,
2570; EG-NEXT:     MULHI * T0.Y, T0.W, PV.W,
2571; EG-NEXT:     ADD_INT T3.W, PS, T0.X,
2572; EG-NEXT:     MULLO_INT * T0.X, PV.W, T2.W,
2573; EG-NEXT:     ADD_INT T0.Y, PV.W, PS,
2574; EG-NEXT:     MOV T1.W, literal.x,
2575; EG-NEXT:     MULLO_INT * T0.X, T0.W, T2.W,
2576; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2577; EG-NEXT:    ALU clause starting at 31:
2578; EG-NEXT:     MOV T0.W, KC0[2].Y,
2579; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
2580; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2581; EG-NEXT:    ALU clause starting at 34:
2582; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2583; EG-NEXT:    ALU clause starting at 35:
2584; EG-NEXT:     LSHR * T1.X, T0.W, literal.x,
2585; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2586entry:
; Branch on %a: zero takes the load path (%if), non-zero takes the multiply
; path (%else).
2587  %0 = icmp eq i64 %a, 0
2588  br i1 %0, label %if, label %else
2589
2590if:
; No multiply on this path; the result comes straight from memory.
2591  %1 = load i64, ptr addrspace(1) %in
2592  br label %endif
2593
2594else:
; The only 64-bit multiply in the function.
2595  %2 = mul i64 %a, %b
2596  br label %endif
2597
2598endif:
; Merge the loaded value and the product, then store whichever was produced.
2599  %3 = phi i64 [%1, %if], [%2, %else]
2600  store i64 %3, ptr addrspace(1) %out
2601  ret void
2602}
2603
; Tests a 128-bit multiply whose operands %a and %b are both kernel arguments;
; the product is stored to %out.
;
; The two unnamed [8 x i32] parameters are never referenced by the body.
; NOTE(review): presumably they only space %a and %b apart in the kernarg
; layout (the checks load them from separate offsets, e.g. 0x4c and 0x7c on
; tonga and newer) - confirm.
;
; The checks expect the multiply to be expanded into narrower multiplies and
; adds: 32-bit pieces on most targets, and s_mul_u64 / s_add_nc_u64 on gfx1200.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
2604define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
2605; SI-LABEL: s_mul_i128:
2606; SI:       ; %bb.0: ; %entry
2607; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x13
2608; SI-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x1f
2609; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2610; SI-NEXT:    s_mov_b32 s3, 0xf000
2611; SI-NEXT:    s_mov_b32 s2, -1
2612; SI-NEXT:    s_waitcnt lgkmcnt(0)
2613; SI-NEXT:    v_mov_b32_e32 v0, s10
2614; SI-NEXT:    v_mul_hi_u32 v0, s12, v0
2615; SI-NEXT:    v_mov_b32_e32 v1, s8
2616; SI-NEXT:    v_mul_hi_u32 v1, s14, v1
2617; SI-NEXT:    s_mul_i32 s4, s12, s11
2618; SI-NEXT:    s_mul_i32 s5, s13, s10
2619; SI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
2620; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
2621; SI-NEXT:    s_mul_i32 s5, s14, s9
2622; SI-NEXT:    s_mul_i32 s4, s12, s10
2623; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
2624; SI-NEXT:    s_mul_i32 s5, s15, s8
2625; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
2626; SI-NEXT:    s_mul_i32 s5, s14, s8
2627; SI-NEXT:    v_mov_b32_e32 v2, s4
2628; SI-NEXT:    v_add_i32_e32 v2, vcc, s5, v2
2629; SI-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
2630; SI-NEXT:    v_mov_b32_e32 v1, s12
2631; SI-NEXT:    v_mul_hi_u32 v5, s8, v1
2632; SI-NEXT:    v_mul_hi_u32 v1, s9, v1
2633; SI-NEXT:    v_mov_b32_e32 v3, s13
2634; SI-NEXT:    v_mul_hi_u32 v4, s8, v3
2635; SI-NEXT:    s_mul_i32 s5, s9, s12
2636; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v5
2637; SI-NEXT:    s_mul_i32 s4, s8, s13
2638; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
2639; SI-NEXT:    v_add_i32_e32 v1, vcc, s4, v5
2640; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
2641; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
2642; SI-NEXT:    v_mul_hi_u32 v3, s9, v3
2643; SI-NEXT:    v_addc_u32_e64 v5, s[4:5], 0, 0, vcc
2644; SI-NEXT:    s_mul_i32 s4, s9, s13
2645; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
2646; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2647; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
2648; SI-NEXT:    s_mul_i32 s4, s8, s12
2649; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
2650; SI-NEXT:    v_mov_b32_e32 v0, s4
2651; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2652; SI-NEXT:    s_endpgm
2653;
2654; VI-LABEL: s_mul_i128:
2655; VI:       ; %bb.0: ; %entry
2656; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4c
2657; VI-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x7c
2658; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2659; VI-NEXT:    v_mov_b32_e32 v5, 0
2660; VI-NEXT:    s_mov_b32 s3, 0xf000
2661; VI-NEXT:    s_waitcnt lgkmcnt(0)
2662; VI-NEXT:    v_mov_b32_e32 v0, s10
2663; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0
2664; VI-NEXT:    s_mul_i32 s4, s12, s11
2665; VI-NEXT:    v_mov_b32_e32 v6, s12
2666; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
2667; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0
2668; VI-NEXT:    s_mul_i32 s6, s13, s10
2669; VI-NEXT:    v_add_u32_e32 v3, vcc, s6, v3
2670; VI-NEXT:    v_mov_b32_e32 v4, v1
2671; VI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5]
2672; VI-NEXT:    v_mov_b32_e32 v8, s8
2673; VI-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s14, v8, v[2:3]
2674; VI-NEXT:    v_mov_b32_e32 v3, v7
2675; VI-NEXT:    v_mov_b32_e32 v7, v5
2676; VI-NEXT:    v_mov_b32_e32 v8, s13
2677; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s8, v8, v[6:7]
2678; VI-NEXT:    s_mul_i32 s6, s15, s8
2679; VI-NEXT:    v_add_u32_e32 v6, vcc, s6, v2
2680; VI-NEXT:    v_mov_b32_e32 v2, v5
2681; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
2682; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
2683; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s9, v8, v[2:3]
2684; VI-NEXT:    s_mul_i32 s6, s14, s9
2685; VI-NEXT:    v_add_u32_e32 v5, vcc, s6, v6
2686; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
2687; VI-NEXT:    s_mov_b32 s2, -1
2688; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2689; VI-NEXT:    v_mov_b32_e32 v1, v4
2690; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2691; VI-NEXT:    s_endpgm
2692;
2693; GFX9-LABEL: s_mul_i128:
2694; GFX9:       ; %bb.0: ; %entry
2695; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4c
2696; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x7c
2697; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2698; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2699; GFX9-NEXT:    s_mov_b32 s2, -1
2700; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2701; GFX9-NEXT:    s_mul_i32 s4, s12, s11
2702; GFX9-NEXT:    s_mul_hi_u32 s5, s12, s10
2703; GFX9-NEXT:    s_mul_i32 s6, s14, s9
2704; GFX9-NEXT:    s_mul_hi_u32 s7, s14, s8
2705; GFX9-NEXT:    s_add_i32 s4, s5, s4
2706; GFX9-NEXT:    s_mul_i32 s5, s13, s10
2707; GFX9-NEXT:    s_add_i32 s6, s7, s6
2708; GFX9-NEXT:    s_mul_i32 s7, s15, s8
2709; GFX9-NEXT:    s_add_i32 s4, s4, s5
2710; GFX9-NEXT:    s_mul_i32 s5, s12, s10
2711; GFX9-NEXT:    s_add_i32 s6, s6, s7
2712; GFX9-NEXT:    s_mul_i32 s7, s14, s8
2713; GFX9-NEXT:    s_add_u32 s7, s7, s5
2714; GFX9-NEXT:    s_addc_u32 s6, s6, s4
2715; GFX9-NEXT:    s_mul_i32 s14, s9, s12
2716; GFX9-NEXT:    s_mul_hi_u32 s15, s8, s12
2717; GFX9-NEXT:    s_mul_hi_u32 s11, s9, s12
2718; GFX9-NEXT:    s_add_u32 s14, s14, s15
2719; GFX9-NEXT:    s_mul_i32 s5, s8, s13
2720; GFX9-NEXT:    s_addc_u32 s11, s11, 0
2721; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s13
2722; GFX9-NEXT:    s_add_u32 s5, s5, s14
2723; GFX9-NEXT:    s_addc_u32 s10, s10, 0
2724; GFX9-NEXT:    s_add_u32 s10, s11, s10
2725; GFX9-NEXT:    s_addc_u32 s11, 0, 0
2726; GFX9-NEXT:    s_mul_hi_u32 s14, s9, s13
2727; GFX9-NEXT:    s_mul_i32 s9, s9, s13
2728; GFX9-NEXT:    s_add_u32 s9, s9, s10
2729; GFX9-NEXT:    s_addc_u32 s10, s14, s11
2730; GFX9-NEXT:    s_mov_b32 s4, 0
2731; GFX9-NEXT:    s_add_u32 s9, s9, s7
2732; GFX9-NEXT:    s_addc_u32 s10, s10, s6
2733; GFX9-NEXT:    s_mul_i32 s6, s8, s12
2734; GFX9-NEXT:    s_mov_b32 s7, s4
2735; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
2736; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2737; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2738; GFX9-NEXT:    v_mov_b32_e32 v2, s9
2739; GFX9-NEXT:    v_mov_b32_e32 v3, s10
2740; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2741; GFX9-NEXT:    s_endpgm
2742;
2743; GFX10-LABEL: s_mul_i128:
2744; GFX10:       ; %bb.0: ; %entry
2745; GFX10-NEXT:    s_clause 0x2
2746; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4c
2747; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x7c
2748; GFX10-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
2749; GFX10-NEXT:    s_mov_b32 s6, 0
2750; GFX10-NEXT:    s_mov_b32 s5, s6
2751; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2752; GFX10-NEXT:    s_mul_i32 s3, s8, s3
2753; GFX10-NEXT:    s_mul_hi_u32 s4, s8, s2
2754; GFX10-NEXT:    s_mul_i32 s14, s10, s1
2755; GFX10-NEXT:    s_mul_hi_u32 s15, s10, s0
2756; GFX10-NEXT:    s_mul_i32 s7, s9, s2
2757; GFX10-NEXT:    s_mul_i32 s11, s11, s0
2758; GFX10-NEXT:    s_add_i32 s3, s4, s3
2759; GFX10-NEXT:    s_add_i32 s4, s15, s14
2760; GFX10-NEXT:    s_mul_i32 s2, s8, s2
2761; GFX10-NEXT:    s_mul_i32 s10, s10, s0
2762; GFX10-NEXT:    s_add_i32 s3, s3, s7
2763; GFX10-NEXT:    s_add_i32 s4, s4, s11
2764; GFX10-NEXT:    s_mul_i32 s19, s1, s8
2765; GFX10-NEXT:    s_mul_hi_u32 s20, s0, s8
2766; GFX10-NEXT:    s_add_u32 s2, s10, s2
2767; GFX10-NEXT:    s_mul_hi_u32 s18, s1, s8
2768; GFX10-NEXT:    s_addc_u32 s3, s4, s3
2769; GFX10-NEXT:    s_mul_i32 s17, s0, s9
2770; GFX10-NEXT:    s_add_u32 s4, s19, s20
2771; GFX10-NEXT:    s_mul_hi_u32 s16, s0, s9
2772; GFX10-NEXT:    s_mul_hi_u32 s21, s1, s9
2773; GFX10-NEXT:    s_mul_i32 s1, s1, s9
2774; GFX10-NEXT:    s_addc_u32 s9, s18, 0
2775; GFX10-NEXT:    s_add_u32 s7, s17, s4
2776; GFX10-NEXT:    s_addc_u32 s10, s16, 0
2777; GFX10-NEXT:    s_mul_i32 s4, s0, s8
2778; GFX10-NEXT:    s_add_u32 s0, s9, s10
2779; GFX10-NEXT:    s_addc_u32 s8, 0, 0
2780; GFX10-NEXT:    s_add_u32 s0, s1, s0
2781; GFX10-NEXT:    s_addc_u32 s1, s21, s8
2782; GFX10-NEXT:    s_add_u32 s2, s0, s2
2783; GFX10-NEXT:    s_addc_u32 s3, s1, s3
2784; GFX10-NEXT:    s_or_b64 s[0:1], s[4:5], s[6:7]
2785; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2786; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2787; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2788; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2789; GFX10-NEXT:    s_mov_b32 s15, 0x31016000
2790; GFX10-NEXT:    s_mov_b32 s14, -1
2791; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2792; GFX10-NEXT:    s_endpgm
2793;
2794; GFX11-LABEL: s_mul_i128:
2795; GFX11:       ; %bb.0: ; %entry
2796; GFX11-NEXT:    s_clause 0x2
2797; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x4c
2798; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x7c
2799; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
2800; GFX11-NEXT:    s_mov_b32 s6, 0
2801; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2802; GFX11-NEXT:    s_mov_b32 s13, s6
2803; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2804; GFX11-NEXT:    s_mul_i32 s3, s8, s3
2805; GFX11-NEXT:    s_mul_hi_u32 s7, s8, s2
2806; GFX11-NEXT:    s_mul_i32 s14, s10, s1
2807; GFX11-NEXT:    s_mul_hi_u32 s15, s10, s0
2808; GFX11-NEXT:    s_mul_i32 s12, s9, s2
2809; GFX11-NEXT:    s_mul_i32 s11, s11, s0
2810; GFX11-NEXT:    s_add_i32 s3, s7, s3
2811; GFX11-NEXT:    s_add_i32 s7, s15, s14
2812; GFX11-NEXT:    s_mul_i32 s2, s8, s2
2813; GFX11-NEXT:    s_mul_i32 s10, s10, s0
2814; GFX11-NEXT:    s_add_i32 s3, s3, s12
2815; GFX11-NEXT:    s_add_i32 s7, s7, s11
2816; GFX11-NEXT:    s_mul_i32 s19, s1, s8
2817; GFX11-NEXT:    s_mul_hi_u32 s20, s0, s8
2818; GFX11-NEXT:    s_add_u32 s2, s10, s2
2819; GFX11-NEXT:    s_mul_hi_u32 s18, s1, s8
2820; GFX11-NEXT:    s_addc_u32 s3, s7, s3
2821; GFX11-NEXT:    s_mul_i32 s17, s0, s9
2822; GFX11-NEXT:    s_add_u32 s7, s19, s20
2823; GFX11-NEXT:    s_mul_hi_u32 s16, s0, s9
2824; GFX11-NEXT:    s_mul_hi_u32 s21, s1, s9
2825; GFX11-NEXT:    s_mul_i32 s1, s1, s9
2826; GFX11-NEXT:    s_addc_u32 s9, s18, 0
2827; GFX11-NEXT:    s_add_u32 s7, s17, s7
2828; GFX11-NEXT:    s_addc_u32 s10, s16, 0
2829; GFX11-NEXT:    s_mul_i32 s12, s0, s8
2830; GFX11-NEXT:    s_add_u32 s0, s9, s10
2831; GFX11-NEXT:    s_addc_u32 s8, 0, 0
2832; GFX11-NEXT:    s_add_u32 s0, s1, s0
2833; GFX11-NEXT:    s_addc_u32 s1, s21, s8
2834; GFX11-NEXT:    s_add_u32 s2, s0, s2
2835; GFX11-NEXT:    s_addc_u32 s3, s1, s3
2836; GFX11-NEXT:    s_or_b64 s[0:1], s[12:13], s[6:7]
2837; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2838; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
2839; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
2840; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
2841; GFX11-NEXT:    s_mov_b32 s6, -1
2842; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
2843; GFX11-NEXT:    s_endpgm
2844;
2845; GFX12-LABEL: s_mul_i128:
2846; GFX12:       ; %bb.0: ; %entry
2847; GFX12-NEXT:    s_clause 0x1
2848; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x7c
2849; GFX12-NEXT:    s_load_b128 s[12:15], s[4:5], 0x4c
2850; GFX12-NEXT:    s_mov_b32 s3, 0
2851; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2852; GFX12-NEXT:    s_mov_b32 s7, s3
2853; GFX12-NEXT:    s_mov_b32 s5, s3
2854; GFX12-NEXT:    s_mov_b32 s17, s3
2855; GFX12-NEXT:    s_mov_b32 s19, s3
2856; GFX12-NEXT:    s_mov_b32 s24, s3
2857; GFX12-NEXT:    s_wait_kmcnt 0x0
2858; GFX12-NEXT:    s_mov_b32 s2, s8
2859; GFX12-NEXT:    s_mov_b32 s6, s12
2860; GFX12-NEXT:    s_mov_b32 s4, s13
2861; GFX12-NEXT:    s_mul_u64 s[22:23], s[6:7], s[2:3]
2862; GFX12-NEXT:    s_mul_u64 s[20:21], s[4:5], s[2:3]
2863; GFX12-NEXT:    s_mov_b32 s2, s23
2864; GFX12-NEXT:    s_mov_b32 s16, s9
2865; GFX12-NEXT:    s_mul_u64 s[10:11], s[10:11], s[12:13]
2866; GFX12-NEXT:    s_add_nc_u64 s[12:13], s[20:21], s[2:3]
2867; GFX12-NEXT:    s_mul_u64 s[6:7], s[6:7], s[16:17]
2868; GFX12-NEXT:    s_mov_b32 s2, s13
2869; GFX12-NEXT:    s_mov_b32 s13, s3
2870; GFX12-NEXT:    s_mul_u64 s[8:9], s[8:9], s[14:15]
2871; GFX12-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[12:13]
2872; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[16:17]
2873; GFX12-NEXT:    s_mov_b32 s18, s7
2874; GFX12-NEXT:    s_mov_b32 s23, s3
2875; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[18:19]
2876; GFX12-NEXT:    s_add_nc_u64 s[8:9], s[10:11], s[8:9]
2877; GFX12-NEXT:    s_mov_b32 s25, s6
2878; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
2879; GFX12-NEXT:    s_or_b64 s[6:7], s[22:23], s[24:25]
2880; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[8:9]
2881; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
2882; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2883; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
2884; GFX12-NEXT:    s_mov_b32 s2, -1
2885; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
2886; GFX12-NEXT:    s_endpgm
2887;
2888; EG-LABEL: s_mul_i128:
2889; EG:       ; %bb.0: ; %entry
2890; EG-NEXT:    ALU 41, @4, KC0[CB0:0-32], KC1[]
2891; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2892; EG-NEXT:    CF_END
2893; EG-NEXT:    PAD
2894; EG-NEXT:    ALU clause starting at 4:
2895; EG-NEXT:     MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
2896; EG-NEXT:     MULHI * T0.Y, KC0[5].X, KC0[8].X,
2897; EG-NEXT:     MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
2898; EG-NEXT:     MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
2899; EG-NEXT:     MULHI * T1.X, KC0[5].X, KC0[7].W,
2900; EG-NEXT:     MULHI * T1.Y, KC0[4].W, KC0[8].X,
2901; EG-NEXT:     MULHI * T1.Z, KC0[8].Y, KC0[4].W,
2902; EG-NEXT:     MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
2903; EG-NEXT:     MULHI * T2.X, KC0[7].W, KC0[5].Y,
2904; EG-NEXT:     MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
2905; EG-NEXT:     MULHI * T2.Z, KC0[4].W, KC0[7].W,
2906; EG-NEXT:     ADD_INT T2.W, T2.Y, PS,
2907; EG-NEXT:     MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
2908; EG-NEXT:     ADDC_UINT T2.Z, T2.Y, T2.Z,
2909; EG-NEXT:     ADDC_UINT T3.W, PS, PV.W,
2910; EG-NEXT:     MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
2911; EG-NEXT:     ADD_INT T2.X, T2.X, PS,
2912; EG-NEXT:     ADD_INT T2.Y, T1.Z, T1.W,
2913; EG-NEXT:     ADD_INT T1.Z, T1.Y, PV.W,
2914; EG-NEXT:     ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
2915; EG-NEXT:     MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
2916; EG-NEXT:     ADD_INT T4.X, PV.W, PV.Z,
2917; EG-NEXT:     ADDC_UINT T1.Y, PV.W, PV.Z,
2918; EG-NEXT:     ADD_INT T1.Z, PV.Y, PS,
2919; EG-NEXT:     ADD_INT T0.W, PV.X, T0.W,
2920; EG-NEXT:     MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
2921; EG-NEXT:     ADD_INT T2.Y, PV.Z, PV.W,
2922; EG-NEXT:     ADDC_UINT T1.Z, T0.Z, PS,
2923; EG-NEXT:     ADD_INT T0.W, T0.Y, PV.Y,
2924; EG-NEXT:     ADDC_UINT * T1.W, T0.X, PV.X,
2925; EG-NEXT:     ADD_INT T0.Y, T0.X, T4.X,
2926; EG-NEXT:     ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
2927; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2928; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
2929; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2930; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
2931; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
2932; EG-NEXT:     ADD_INT * T0.Z, T0.Y, T0.Z,
2933; EG-NEXT:     ADD_INT * T0.Y, T3.X, T2.W,
2934; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2935; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2936; EG-NEXT:     MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
2937entry:
; Single 128-bit multiply of the two kernel arguments, stored directly to %out.
2938  %mul = mul i128 %a, %b
2939  store i128 %mul, ptr addrspace(1) %out
2940  ret void
2941}
2942
2943define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2944; SI-LABEL: v_mul_i128:
2945; SI:       ; %bb.0: ; %entry
2946; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
2947; SI-NEXT:    s_mov_b32 s7, 0xf000
2948; SI-NEXT:    s_mov_b32 s6, 0
2949; SI-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
2950; SI-NEXT:    v_mov_b32_e32 v9, 0
2951; SI-NEXT:    s_waitcnt lgkmcnt(0)
2952; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
2953; SI-NEXT:    s_mov_b64 s[0:1], s[2:3]
2954; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2955; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
2956; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
2957; SI-NEXT:    s_waitcnt vmcnt(0)
2958; SI-NEXT:    v_mul_lo_u32 v3, v4, v3
2959; SI-NEXT:    v_mul_hi_u32 v10, v4, v2
2960; SI-NEXT:    v_mul_lo_u32 v12, v6, v1
2961; SI-NEXT:    v_mul_hi_u32 v13, v6, v0
2962; SI-NEXT:    v_mul_lo_u32 v17, v1, v4
2963; SI-NEXT:    v_mul_hi_u32 v18, v0, v4
2964; SI-NEXT:    v_mul_lo_u32 v11, v5, v2
2965; SI-NEXT:    v_mul_lo_u32 v7, v7, v0
2966; SI-NEXT:    v_mul_hi_u32 v16, v1, v4
2967; SI-NEXT:    v_mul_lo_u32 v15, v0, v5
2968; SI-NEXT:    v_mul_hi_u32 v14, v0, v5
2969; SI-NEXT:    v_mul_hi_u32 v19, v1, v5
2970; SI-NEXT:    v_mul_lo_u32 v5, v1, v5
2971; SI-NEXT:    v_add_i32_e32 v1, vcc, v10, v3
2972; SI-NEXT:    v_add_i32_e32 v3, vcc, v13, v12
2973; SI-NEXT:    v_mul_lo_u32 v2, v4, v2
2974; SI-NEXT:    v_mul_lo_u32 v6, v6, v0
2975; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
2976; SI-NEXT:    v_add_i32_e32 v4, vcc, v17, v18
2977; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v16, vcc
2978; SI-NEXT:    v_add_i32_e32 v11, vcc, v1, v11
2979; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
2980; SI-NEXT:    v_add_i32_e32 v1, vcc, v15, v4
2981; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
2982; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
2983; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
2984; SI-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
2985; SI-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
2986; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2987; SI-NEXT:    v_addc_u32_e32 v5, vcc, v19, v6, vcc
2988; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
2989; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
2990; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
2991; SI-NEXT:    s_endpgm
2992;
2993; VI-LABEL: v_mul_i128:
2994; VI:       ; %bb.0: ; %entry
2995; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
2996; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
2997; VI-NEXT:    v_mov_b32_e32 v10, 0
2998; VI-NEXT:    s_waitcnt lgkmcnt(0)
2999; VI-NEXT:    v_mov_b32_e32 v1, s1
3000; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3001; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3002; VI-NEXT:    v_mov_b32_e32 v3, s3
3003; VI-NEXT:    v_add_u32_e32 v12, vcc, s2, v2
3004; VI-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
3005; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3006; VI-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
3007; VI-NEXT:    s_waitcnt vmcnt(0)
3008; VI-NEXT:    v_mul_lo_u32 v3, v4, v3
3009; VI-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0
3010; VI-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
3011; VI-NEXT:    v_mul_lo_u32 v2, v5, v2
3012; VI-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
3013; VI-NEXT:    v_add_u32_e32 v15, vcc, v3, v2
3014; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10]
3015; VI-NEXT:    v_mov_b32_e32 v4, v3
3016; VI-NEXT:    v_mov_b32_e32 v3, v10
3017; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3]
3018; VI-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15]
3019; VI-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
3020; VI-NEXT:    v_addc_u32_e64 v4, s[0:1], 0, 0, vcc
3021; VI-NEXT:    v_mul_lo_u32 v0, v7, v0
3022; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4]
3023; VI-NEXT:    v_mul_lo_u32 v1, v6, v1
3024; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v10
3025; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
3026; VI-NEXT:    v_add_u32_e32 v10, vcc, v3, v9
3027; VI-NEXT:    v_addc_u32_e32 v11, vcc, v4, v0, vcc
3028; VI-NEXT:    v_mov_b32_e32 v9, v2
3029; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
3030; VI-NEXT:    s_endpgm
3031;
3032; GFX9-LABEL: v_mul_i128:
3033; GFX9:       ; %bb.0: ; %entry
3034; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
3035; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
3036; GFX9-NEXT:    v_mov_b32_e32 v11, 0
3037; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3038; GFX9-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1]
3039; GFX9-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3]
3040; GFX9-NEXT:    s_waitcnt vmcnt(0)
3041; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v2
3042; GFX9-NEXT:    v_mul_lo_u32 v13, v4, v3
3043; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0
3044; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
3045; GFX9-NEXT:    v_add3_u32 v9, v9, v13, v10
3046; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v1
3047; GFX9-NEXT:    v_mov_b32_e32 v10, v3
3048; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
3049; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9]
3050; GFX9-NEXT:    v_mov_b32_e32 v10, v4
3051; GFX9-NEXT:    v_mov_b32_e32 v4, v11
3052; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
3053; GFX9-NEXT:    v_mul_lo_u32 v0, v7, v0
3054; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v4
3055; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc
3056; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11]
3057; GFX9-NEXT:    v_add3_u32 v0, v0, v9, v13
3058; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
3059; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v0, vcc
3060; GFX9-NEXT:    global_store_dwordx4 v12, v[2:5], s[2:3]
3061; GFX9-NEXT:    s_endpgm
3062;
3063; GFX10-LABEL: v_mul_i128:
3064; GFX10:       ; %bb.0: ; %entry
3065; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
3066; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
3067; GFX10-NEXT:    v_mov_b32_e32 v10, 0
3068; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3069; GFX10-NEXT:    s_clause 0x1
3070; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
3071; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
3072; GFX10-NEXT:    s_waitcnt vmcnt(0)
3073; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
3074; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v2
3075; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v0
3076; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
3077; GFX10-NEXT:    v_mov_b32_e32 v14, v12
3078; GFX10-NEXT:    v_mov_b32_e32 v12, v10
3079; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12]
3080; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v3
3081; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v2, 0
3082; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v1
3083; GFX10-NEXT:    v_mov_b32_e32 v4, v10
3084; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v15
3085; GFX10-NEXT:    v_add_co_u32 v10, s0, v14, v4
3086; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s0, 0, 0, s0
3087; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
3088; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11]
3089; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v12
3090; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
3091; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
3092; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
3093; GFX10-NEXT:    s_endpgm
3094;
3095; GFX11-LABEL: v_mul_i128:
3096; GFX11:       ; %bb.0: ; %entry
3097; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
3098; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3099; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3100; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
3101; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3102; GFX11-NEXT:    s_clause 0x1
3103; GFX11-NEXT:    global_load_b128 v[0:3], v15, s[0:1]
3104; GFX11-NEXT:    global_load_b128 v[4:7], v15, s[2:3]
3105; GFX11-NEXT:    s_waitcnt vmcnt(0)
3106; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v0, v4, 0
3107; GFX11-NEXT:    v_mul_lo_u32 v14, v5, v2
3108; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
3109; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3110; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
3111; GFX11-NEXT:    v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
3112; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3113; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
3114; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v4, v2, 0
3115; GFX11-NEXT:    v_mul_lo_u32 v4, v6, v1
3116; GFX11-NEXT:    v_mov_b32_e32 v2, v10
3117; GFX11-NEXT:    v_mul_lo_u32 v10, v7, v0
3118; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
3119; GFX11-NEXT:    v_add3_u32 v12, v12, v3, v14
3120; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2
3121; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3122; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
3123; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12]
3124; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3125; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
3126; GFX11-NEXT:    v_add3_u32 v0, v10, v14, v4
3127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3128; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v13
3129; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
3130; GFX11-NEXT:    global_store_b128 v15, v[8:11], s[2:3]
3131; GFX11-NEXT:    s_endpgm
3132;
3133; GFX12-LABEL: v_mul_i128:
3134; GFX12:       ; %bb.0: ; %entry
3135; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
3136; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3137; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3138; GFX12-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
3139; GFX12-NEXT:    s_wait_kmcnt 0x0
3140; GFX12-NEXT:    s_clause 0x1
3141; GFX12-NEXT:    global_load_b128 v[0:3], v13, s[0:1]
3142; GFX12-NEXT:    global_load_b128 v[4:7], v13, s[2:3]
3143; GFX12-NEXT:    s_wait_loadcnt 0x0
3144; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
3145; GFX12-NEXT:    v_mul_lo_u32 v15, v5, v2
3146; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v0
3147; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3148; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
3149; GFX12-NEXT:    v_mov_b32_e32 v14, v12
3150; GFX12-NEXT:    v_mov_b32_e32 v12, v10
3151; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3152; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
3153; GFX12-NEXT:    v_mul_lo_u32 v11, v4, v3
3154; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v4, v2, 0
3155; GFX12-NEXT:    v_mul_lo_u32 v12, v6, v1
3156; GFX12-NEXT:    v_mov_b32_e32 v4, v10
3157; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3158; GFX12-NEXT:    v_add3_u32 v3, v3, v11, v15
3159; GFX12-NEXT:    v_add_co_u32 v10, s0, v14, v4
3160; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3161; GFX12-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
3162; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3]
3163; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3164; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11]
3165; GFX12-NEXT:    v_add3_u32 v3, v7, v3, v12
3166; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3167; GFX12-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
3168; GFX12-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
3169; GFX12-NEXT:    global_store_b128 v13, v[8:11], s[2:3]
3170; GFX12-NEXT:    s_endpgm
3171;
3172; EG-LABEL: v_mul_i128:
3173; EG:       ; %bb.0: ; %entry
3174; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
3175; EG-NEXT:    TEX 1 @6
3176; EG-NEXT:    ALU 41, @14, KC0[], KC1[]
3177; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
3178; EG-NEXT:    CF_END
3179; EG-NEXT:    PAD
3180; EG-NEXT:    Fetch clause starting at 6:
3181; EG-NEXT:     VTX_READ_128 T2.XYZW, T1.X, 0, #1
3182; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
3183; EG-NEXT:    ALU clause starting at 10:
3184; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
3185; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
3186; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
3187; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
3188; EG-NEXT:    ALU clause starting at 14:
3189; EG-NEXT:     MULLO_INT * T1.Y, T0.Y, T2.Y,
3190; EG-NEXT:     MULHI * T1.Z, T0.Y, T2.Y,
3191; EG-NEXT:     MULLO_INT * T1.W, T2.Z, T0.X,
3192; EG-NEXT:     MULLO_INT * T3.X, T2.Y, T0.Z,
3193; EG-NEXT:     MULHI * T3.Y, T0.Y, T2.X,
3194; EG-NEXT:     MULHI * T3.Z, T0.X, T2.Y,
3195; EG-NEXT:     MULHI * T3.W, T2.Z, T0.X,
3196; EG-NEXT:     MULLO_INT * T2.Z, T2.Z, T0.Y,
3197; EG-NEXT:     MULHI * T4.X, T2.X, T0.Z,
3198; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T2.X,
3199; EG-NEXT:     MULHI * T4.Y, T0.X, T2.X,
3200; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
3201; EG-NEXT:     MULLO_INT * T2.Y, T0.X, T2.Y,
3202; EG-NEXT:     ADDC_UINT T4.Z, T0.Y, T4.Y,
3203; EG-NEXT:     ADDC_UINT T5.W, PS, PV.W,
3204; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.W,
3205; EG-NEXT:     ADD_INT T4.X, T4.X, PS,
3206; EG-NEXT:     ADD_INT T0.Y, T3.W, T2.Z,
3207; EG-NEXT:     ADD_INT T2.Z, T3.Z, PV.W,
3208; EG-NEXT:     ADD_INT T0.W, T3.Y, PV.Z,
3209; EG-NEXT:     MULLO_INT * T2.W, T2.W, T0.X,
3210; EG-NEXT:     ADD_INT T5.X, PV.W, PV.Z,
3211; EG-NEXT:     ADDC_UINT T3.Y, PV.W, PV.Z,
3212; EG-NEXT:     ADD_INT T2.Z, PV.Y, PS,
3213; EG-NEXT:     ADD_INT T0.W, PV.X, T3.X,
3214; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.Z,
3215; EG-NEXT:     ADD_INT T4.Y, PV.Z, PV.W,
3216; EG-NEXT:     ADDC_UINT T0.Z, T1.W, PS,
3217; EG-NEXT:     ADD_INT T0.W, T1.Z, PV.Y,
3218; EG-NEXT:     ADDC_UINT * T2.W, T1.Y, PV.X,
3219; EG-NEXT:     ADD_INT T1.Y, T1.Y, T5.X,
3220; EG-NEXT:     ADD_INT T1.Z, T1.W, T0.Y,
3221; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
3222; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
3223; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
3224; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
3225; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
3226; EG-NEXT:     ADD_INT * T0.Z, T1.Y, T1.Z,
3227; EG-NEXT:     ADD_INT * T0.Y, T2.Y, T4.W,
3228; EG-NEXT:     LSHR T1.X, T1.X, literal.x,
3229; EG-NEXT:     MULLO_INT * T0.X, T0.X, T2.X,
3230; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3231entry:
3232  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3233  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
3234  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
3235  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
3236  %a = load i128, ptr addrspace(1) %gep.a
3237  %b = load i128, ptr addrspace(1) %gep.b
3238  %mul = mul i128 %a, %b
3239  store i128 %mul, ptr addrspace(1) %gep.out
3240  ret void
3241}
3242
; Multiplies its i32 argument by the constant 9 (2^3 + 1) and returns the
; product. The check lines below are autogenerated (see the note at the top of
; this file). They expect a plain v_mul_lo_u32 by 9 for the SI and VI run
; lines, and a single v_lshl_add_u32 (shift left by 3, then add the original
; value) for the GFX9, GFX10, GFX11 and GFX12 run lines. For the EG run line
; only CF_END and PAD are checked.
3243define i32 @mul_pow2_plus_1(i32 %val) {
3244; SI-LABEL: mul_pow2_plus_1:
3245; SI:       ; %bb.0:
3246; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3247; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
3248; SI-NEXT:    s_setpc_b64 s[30:31]
3249;
3250; VI-LABEL: mul_pow2_plus_1:
3251; VI:       ; %bb.0:
3252; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3253; VI-NEXT:    v_mul_lo_u32 v0, v0, 9
3254; VI-NEXT:    s_setpc_b64 s[30:31]
3255;
3256; GFX9-LABEL: mul_pow2_plus_1:
3257; GFX9:       ; %bb.0:
3258; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3259; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3260; GFX9-NEXT:    s_setpc_b64 s[30:31]
3261;
3262; GFX10-LABEL: mul_pow2_plus_1:
3263; GFX10:       ; %bb.0:
3264; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3265; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3266; GFX10-NEXT:    s_setpc_b64 s[30:31]
3267;
3268; GFX11-LABEL: mul_pow2_plus_1:
3269; GFX11:       ; %bb.0:
3270; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3271; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3272; GFX11-NEXT:    s_setpc_b64 s[30:31]
3273;
3274; GFX12-LABEL: mul_pow2_plus_1:
3275; GFX12:       ; %bb.0:
3276; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3277; GFX12-NEXT:    s_wait_expcnt 0x0
3278; GFX12-NEXT:    s_wait_samplecnt 0x0
3279; GFX12-NEXT:    s_wait_bvhcnt 0x0
3280; GFX12-NEXT:    s_wait_kmcnt 0x0
3281; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3282; GFX12-NEXT:    s_setpc_b64 s[30:31]
3283;
3284; EG-LABEL: mul_pow2_plus_1:
3285; EG:       ; %bb.0:
3286; EG-NEXT:    CF_END
3287; EG-NEXT:    PAD
3288  %mul = mul i32 %val, 9
3289  ret i32 %mul
3290}
3291
; Declaration of the workitem-id intrinsic that test functions in this file
; call to obtain a per-lane index (used as %tid). It carries attribute group
; #1, defined below.
3292declare i32 @llvm.amdgcn.workitem.id.x() #1
3293
; Attribute groups. #1 is attached to the intrinsic declaration above.
; NOTE(review): no use of #0 is visible in this part of the file; presumably it
; is referenced by earlier function definitions -- confirm before removing.
3294attributes #0 = { nounwind }
3295attributes #1 = { nounwind readnone}
3296