xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mul.ll (revision ba52f06f9d92c7ca04b440f618f8d352ea121fcc)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
8; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
9
10; mul24 and mad24 are affected
11
; Element-wise multiply of two <2 x i32> vectors.
; %a is loaded from %in and %b from the <2 x i32> slot immediately after it;
; the product is stored to %out. The autogenerated checks below expect a single
; 128-bit load covering both operands, followed by two 32-bit low multiplies
; (v_mul_lo_u32 on the amdgcn targets, MULLO_INT on r600).
12define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
13; SI-LABEL: test_mul_v2i32:
14; SI:       ; %bb.0: ; %entry
15; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
16; SI-NEXT:    s_mov_b32 s7, 0xf000
17; SI-NEXT:    s_mov_b32 s6, -1
18; SI-NEXT:    s_mov_b32 s10, s6
19; SI-NEXT:    s_mov_b32 s11, s7
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    s_mov_b32 s8, s2
22; SI-NEXT:    s_mov_b32 s9, s3
23; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
24; SI-NEXT:    s_mov_b32 s4, s0
25; SI-NEXT:    s_mov_b32 s5, s1
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    v_mul_lo_u32 v1, v1, v3
28; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
29; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
30; SI-NEXT:    s_endpgm
31;
32; VI-LABEL: test_mul_v2i32:
33; VI:       ; %bb.0: ; %entry
34; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
35; VI-NEXT:    s_mov_b32 s7, 0xf000
36; VI-NEXT:    s_mov_b32 s6, -1
37; VI-NEXT:    s_mov_b32 s10, s6
38; VI-NEXT:    s_mov_b32 s11, s7
39; VI-NEXT:    s_waitcnt lgkmcnt(0)
40; VI-NEXT:    s_mov_b32 s8, s2
41; VI-NEXT:    s_mov_b32 s9, s3
42; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
43; VI-NEXT:    s_mov_b32 s4, s0
44; VI-NEXT:    s_mov_b32 s5, s1
45; VI-NEXT:    s_waitcnt vmcnt(0)
46; VI-NEXT:    v_mul_lo_u32 v1, v1, v3
47; VI-NEXT:    v_mul_lo_u32 v0, v0, v2
48; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
49; VI-NEXT:    s_endpgm
50;
51; GFX9-LABEL: test_mul_v2i32:
52; GFX9:       ; %bb.0: ; %entry
53; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
54; GFX9-NEXT:    s_mov_b32 s7, 0xf000
55; GFX9-NEXT:    s_mov_b32 s6, -1
56; GFX9-NEXT:    s_mov_b32 s10, s6
57; GFX9-NEXT:    s_mov_b32 s11, s7
58; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX9-NEXT:    s_mov_b32 s8, s2
60; GFX9-NEXT:    s_mov_b32 s9, s3
61; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
62; GFX9-NEXT:    s_mov_b32 s4, s0
63; GFX9-NEXT:    s_mov_b32 s5, s1
64; GFX9-NEXT:    s_waitcnt vmcnt(0)
65; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
66; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
67; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
68; GFX9-NEXT:    s_endpgm
69;
70; GFX10-LABEL: test_mul_v2i32:
71; GFX10:       ; %bb.0: ; %entry
72; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
73; GFX10-NEXT:    s_mov_b32 s6, -1
74; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
75; GFX10-NEXT:    s_mov_b32 s10, s6
76; GFX10-NEXT:    s_mov_b32 s11, s7
77; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX10-NEXT:    s_mov_b32 s8, s2
79; GFX10-NEXT:    s_mov_b32 s9, s3
80; GFX10-NEXT:    s_mov_b32 s4, s0
81; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82; GFX10-NEXT:    s_mov_b32 s5, s1
83; GFX10-NEXT:    s_waitcnt vmcnt(0)
84; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
85; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
86; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
87; GFX10-NEXT:    s_endpgm
88;
89; GFX11-LABEL: test_mul_v2i32:
90; GFX11:       ; %bb.0: ; %entry
91; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
92; GFX11-NEXT:    s_mov_b32 s6, -1
93; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
94; GFX11-NEXT:    s_mov_b32 s10, s6
95; GFX11-NEXT:    s_mov_b32 s11, s7
96; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX11-NEXT:    s_mov_b32 s8, s2
98; GFX11-NEXT:    s_mov_b32 s9, s3
99; GFX11-NEXT:    s_mov_b32 s4, s0
100; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
101; GFX11-NEXT:    s_mov_b32 s5, s1
102; GFX11-NEXT:    s_waitcnt vmcnt(0)
103; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v3
104; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v2
105; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
106; GFX11-NEXT:    s_nop 0
107; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
108; GFX11-NEXT:    s_endpgm
109;
110; GFX12-LABEL: test_mul_v2i32:
111; GFX12:       ; %bb.0: ; %entry
112; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
113; GFX12-NEXT:    s_mov_b32 s6, -1
114; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
115; GFX12-NEXT:    s_mov_b32 s10, s6
116; GFX12-NEXT:    s_mov_b32 s11, s7
117; GFX12-NEXT:    s_wait_kmcnt 0x0
118; GFX12-NEXT:    s_mov_b32 s8, s2
119; GFX12-NEXT:    s_mov_b32 s9, s3
120; GFX12-NEXT:    s_mov_b32 s4, s0
121; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
122; GFX12-NEXT:    s_mov_b32 s5, s1
123; GFX12-NEXT:    s_wait_loadcnt 0x0
124; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
125; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
126; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
127; GFX12-NEXT:    s_nop 0
128; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
129; GFX12-NEXT:    s_endpgm
130;
131; EG-LABEL: test_mul_v2i32:
132; EG:       ; %bb.0: ; %entry
133; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
134; EG-NEXT:    TEX 0 @6
135; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
136; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
137; EG-NEXT:    CF_END
138; EG-NEXT:    PAD
139; EG-NEXT:    Fetch clause starting at 6:
140; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
141; EG-NEXT:    ALU clause starting at 8:
142; EG-NEXT:     MOV * T0.X, KC0[2].Z,
143; EG-NEXT:    ALU clause starting at 9:
144; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T0.W,
145; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
146; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Z,
147; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
148entry:
; %b_ptr addresses the second <2 x i32> element that follows %in.
149  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
150  %a = load <2 x i32>, ptr addrspace(1) %in
151  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
152  %result = mul <2 x i32> %a, %b
153  store <2 x i32> %result, ptr addrspace(1) %out
154  ret void
155}
156
; Element-wise multiply of two <4 x i32> vectors.
; %a is loaded from %in and %b from the next <4 x i32> slot (byte offset 16 in
; the checks below); the product is stored to %out. The autogenerated checks
; expect two 128-bit loads followed by four 32-bit low multiplies
; (v_mul_lo_u32 on the amdgcn targets, MULLO_INT on r600).
157define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
158; SI-LABEL: v_mul_v4i32:
159; SI:       ; %bb.0: ; %entry
160; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
161; SI-NEXT:    s_mov_b32 s7, 0xf000
162; SI-NEXT:    s_mov_b32 s6, -1
163; SI-NEXT:    s_mov_b32 s10, s6
164; SI-NEXT:    s_mov_b32 s11, s7
165; SI-NEXT:    s_waitcnt lgkmcnt(0)
166; SI-NEXT:    s_mov_b32 s8, s2
167; SI-NEXT:    s_mov_b32 s9, s3
168; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
169; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
170; SI-NEXT:    s_mov_b32 s4, s0
171; SI-NEXT:    s_mov_b32 s5, s1
172; SI-NEXT:    s_waitcnt vmcnt(0)
173; SI-NEXT:    v_mul_lo_u32 v3, v3, v7
174; SI-NEXT:    v_mul_lo_u32 v2, v2, v6
175; SI-NEXT:    v_mul_lo_u32 v1, v1, v5
176; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
177; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
178; SI-NEXT:    s_endpgm
179;
180; VI-LABEL: v_mul_v4i32:
181; VI:       ; %bb.0: ; %entry
182; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
183; VI-NEXT:    s_mov_b32 s7, 0xf000
184; VI-NEXT:    s_mov_b32 s6, -1
185; VI-NEXT:    s_mov_b32 s10, s6
186; VI-NEXT:    s_mov_b32 s11, s7
187; VI-NEXT:    s_waitcnt lgkmcnt(0)
188; VI-NEXT:    s_mov_b32 s8, s2
189; VI-NEXT:    s_mov_b32 s9, s3
190; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
191; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
192; VI-NEXT:    s_mov_b32 s4, s0
193; VI-NEXT:    s_mov_b32 s5, s1
194; VI-NEXT:    s_waitcnt vmcnt(0)
195; VI-NEXT:    v_mul_lo_u32 v3, v3, v7
196; VI-NEXT:    v_mul_lo_u32 v2, v2, v6
197; VI-NEXT:    v_mul_lo_u32 v1, v1, v5
198; VI-NEXT:    v_mul_lo_u32 v0, v0, v4
199; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
200; VI-NEXT:    s_endpgm
201;
202; GFX9-LABEL: v_mul_v4i32:
203; GFX9:       ; %bb.0: ; %entry
204; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
205; GFX9-NEXT:    s_mov_b32 s7, 0xf000
206; GFX9-NEXT:    s_mov_b32 s6, -1
207; GFX9-NEXT:    s_mov_b32 s10, s6
208; GFX9-NEXT:    s_mov_b32 s11, s7
209; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX9-NEXT:    s_mov_b32 s8, s2
211; GFX9-NEXT:    s_mov_b32 s9, s3
212; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
213; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
214; GFX9-NEXT:    s_mov_b32 s4, s0
215; GFX9-NEXT:    s_mov_b32 s5, s1
216; GFX9-NEXT:    s_waitcnt vmcnt(0)
217; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
218; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
219; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
220; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
221; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
222; GFX9-NEXT:    s_endpgm
223;
224; GFX10-LABEL: v_mul_v4i32:
225; GFX10:       ; %bb.0: ; %entry
226; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
227; GFX10-NEXT:    s_mov_b32 s6, -1
228; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
229; GFX10-NEXT:    s_mov_b32 s10, s6
230; GFX10-NEXT:    s_mov_b32 s11, s7
231; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX10-NEXT:    s_mov_b32 s8, s2
233; GFX10-NEXT:    s_mov_b32 s9, s3
234; GFX10-NEXT:    s_clause 0x1
235; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
236; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
237; GFX10-NEXT:    s_mov_b32 s4, s0
238; GFX10-NEXT:    s_mov_b32 s5, s1
239; GFX10-NEXT:    s_waitcnt vmcnt(0)
240; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
241; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
242; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
243; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
244; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
245; GFX10-NEXT:    s_endpgm
246;
247; GFX11-LABEL: v_mul_v4i32:
248; GFX11:       ; %bb.0: ; %entry
249; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
250; GFX11-NEXT:    s_mov_b32 s6, -1
251; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
252; GFX11-NEXT:    s_mov_b32 s10, s6
253; GFX11-NEXT:    s_mov_b32 s11, s7
254; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX11-NEXT:    s_mov_b32 s8, s2
256; GFX11-NEXT:    s_mov_b32 s9, s3
257; GFX11-NEXT:    s_clause 0x1
258; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
259; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
260; GFX11-NEXT:    s_mov_b32 s4, s0
261; GFX11-NEXT:    s_mov_b32 s5, s1
262; GFX11-NEXT:    s_waitcnt vmcnt(0)
263; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v7
264; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v6
265; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v5
266; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v4
267; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
268; GFX11-NEXT:    s_nop 0
269; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
270; GFX11-NEXT:    s_endpgm
271;
272; GFX12-LABEL: v_mul_v4i32:
273; GFX12:       ; %bb.0: ; %entry
274; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
275; GFX12-NEXT:    s_mov_b32 s6, -1
276; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
277; GFX12-NEXT:    s_mov_b32 s10, s6
278; GFX12-NEXT:    s_mov_b32 s11, s7
279; GFX12-NEXT:    s_wait_kmcnt 0x0
280; GFX12-NEXT:    s_mov_b32 s8, s2
281; GFX12-NEXT:    s_mov_b32 s9, s3
282; GFX12-NEXT:    s_clause 0x1
283; GFX12-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], null
284; GFX12-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], null offset:16
285; GFX12-NEXT:    s_mov_b32 s4, s0
286; GFX12-NEXT:    s_mov_b32 s5, s1
287; GFX12-NEXT:    s_wait_loadcnt 0x0
288; GFX12-NEXT:    v_mul_lo_u32 v3, v3, v7
289; GFX12-NEXT:    v_mul_lo_u32 v2, v2, v6
290; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v5
291; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v4
292; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], null
293; GFX12-NEXT:    s_nop 0
294; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
295; GFX12-NEXT:    s_endpgm
296;
297; EG-LABEL: v_mul_v4i32:
298; EG:       ; %bb.0: ; %entry
299; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
300; EG-NEXT:    TEX 1 @6
301; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
302; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
303; EG-NEXT:    CF_END
304; EG-NEXT:    PAD
305; EG-NEXT:    Fetch clause starting at 6:
306; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
307; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
308; EG-NEXT:    ALU clause starting at 10:
309; EG-NEXT:     MOV * T0.X, KC0[2].Z,
310; EG-NEXT:    ALU clause starting at 11:
311; EG-NEXT:     MULLO_INT * T0.W, T0.W, T1.W,
312; EG-NEXT:     MULLO_INT * T0.Z, T0.Z, T1.Z,
313; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.Y,
314; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
315; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
316; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
317entry:
; %b_ptr addresses the second <4 x i32> element that follows %in.
318  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
319  %a = load <4 x i32>, ptr addrspace(1) %in
320  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
321  %result = mul <4 x i32> %a, %b
322  store <4 x i32> %result, ptr addrspace(1) %out
323  ret void
324}
325
; 64-bit multiply of two scalar kernel arguments whose result is truncated to
; i32 before being stored to %out.
; The autogenerated checks below expect only one 32-bit multiply of the low
; halves (s_mul_i32 on the amdgcn targets, MULLO_INT on r600); no high-half
; multiply appears in any of the expected sequences.
326define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
327; SI-LABEL: s_trunc_i64_mul_to_i32:
328; SI:       ; %bb.0: ; %entry
329; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
330; SI-NEXT:    s_waitcnt lgkmcnt(0)
331; SI-NEXT:    s_load_dword s7, s[0:1], 0xd
332; SI-NEXT:    s_mov_b32 s3, 0xf000
333; SI-NEXT:    s_mov_b32 s2, -1
334; SI-NEXT:    s_mov_b32 s0, s4
335; SI-NEXT:    s_waitcnt lgkmcnt(0)
336; SI-NEXT:    s_mul_i32 s4, s7, s6
337; SI-NEXT:    s_mov_b32 s1, s5
338; SI-NEXT:    v_mov_b32_e32 v0, s4
339; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
340; SI-NEXT:    s_endpgm
341;
342; VI-LABEL: s_trunc_i64_mul_to_i32:
343; VI:       ; %bb.0: ; %entry
344; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
345; VI-NEXT:    s_waitcnt lgkmcnt(0)
346; VI-NEXT:    s_load_dword s7, s[0:1], 0x34
347; VI-NEXT:    s_mov_b32 s3, 0xf000
348; VI-NEXT:    s_mov_b32 s2, -1
349; VI-NEXT:    s_mov_b32 s0, s4
350; VI-NEXT:    s_waitcnt lgkmcnt(0)
351; VI-NEXT:    s_mul_i32 s4, s7, s6
352; VI-NEXT:    s_mov_b32 s1, s5
353; VI-NEXT:    v_mov_b32_e32 v0, s4
354; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
355; VI-NEXT:    s_endpgm
356;
357; GFX9-LABEL: s_trunc_i64_mul_to_i32:
358; GFX9:       ; %bb.0: ; %entry
359; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
360; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x34
362; GFX9-NEXT:    ; kill: killed $sgpr0_sgpr1
363; GFX9-NEXT:    s_mov_b32 s3, 0xf000
364; GFX9-NEXT:    s_mov_b32 s2, -1
365; GFX9-NEXT:    s_mov_b32 s0, s4
366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX9-NEXT:    s_mul_i32 s4, s7, s6
368; GFX9-NEXT:    s_mov_b32 s1, s5
369; GFX9-NEXT:    v_mov_b32_e32 v0, s4
370; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
371; GFX9-NEXT:    s_endpgm
372;
373; GFX10-LABEL: s_trunc_i64_mul_to_i32:
374; GFX10:       ; %bb.0: ; %entry
375; GFX10-NEXT:    s_clause 0x1
376; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
377; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x34
378; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
380; GFX10-NEXT:    s_mul_i32 s0, s2, s6
381; GFX10-NEXT:    s_mov_b32 s6, -1
382; GFX10-NEXT:    v_mov_b32_e32 v0, s0
383; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
384; GFX10-NEXT:    s_endpgm
385;
386; GFX11-LABEL: s_trunc_i64_mul_to_i32:
387; GFX11:       ; %bb.0: ; %entry
388; GFX11-NEXT:    s_clause 0x1
389; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
390; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
391; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
393; GFX11-NEXT:    s_mul_i32 s0, s0, s6
394; GFX11-NEXT:    s_mov_b32 s6, -1
395; GFX11-NEXT:    v_mov_b32_e32 v0, s0
396; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
397; GFX11-NEXT:    s_nop 0
398; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
399; GFX11-NEXT:    s_endpgm
400;
401; GFX12-LABEL: s_trunc_i64_mul_to_i32:
402; GFX12:       ; %bb.0: ; %entry
403; GFX12-NEXT:    s_clause 0x1
404; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
405; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x34
406; GFX12-NEXT:    s_wait_kmcnt 0x0
407; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
408; GFX12-NEXT:    s_mul_i32 s0, s0, s6
409; GFX12-NEXT:    s_mov_b32 s6, -1
410; GFX12-NEXT:    v_mov_b32_e32 v0, s0
411; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
412; GFX12-NEXT:    s_nop 0
413; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
414; GFX12-NEXT:    s_endpgm
415;
416; EG-LABEL: s_trunc_i64_mul_to_i32:
417; EG:       ; %bb.0: ; %entry
418; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
419; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
420; EG-NEXT:    CF_END
421; EG-NEXT:    PAD
422; EG-NEXT:    ALU clause starting at 4:
423; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
424; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
425; EG-NEXT:     MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
426entry:
; Only the low 32 bits of the i64 product reach the store.
427  %mul = mul i64 %b, %a
428  %trunc = trunc i64 %mul to i32
429  store i32 %trunc, ptr addrspace(1) %out, align 8
430  ret void
431}
432
; 64-bit multiply of two i64 values loaded from %aptr and %bptr, with the
; result truncated to i32 before being stored to %out.
; The autogenerated checks below expect only 32-bit loads of each operand
; (buffer_load_dword / buffer_load_b32 on amdgcn, VTX_READ_32 on r600) and a
; single 32-bit low multiply (v_mul_lo_u32 on amdgcn, MULLO_INT on r600).
433define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
434; SI-LABEL: v_trunc_i64_mul_to_i32:
435; SI:       ; %bb.0: ; %entry
436; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
437; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
438; SI-NEXT:    s_mov_b32 s3, 0xf000
439; SI-NEXT:    s_mov_b32 s2, -1
440; SI-NEXT:    s_mov_b32 s14, s2
441; SI-NEXT:    s_waitcnt lgkmcnt(0)
442; SI-NEXT:    s_mov_b32 s12, s6
443; SI-NEXT:    s_mov_b32 s13, s7
444; SI-NEXT:    s_mov_b32 s15, s3
445; SI-NEXT:    s_mov_b32 s10, s2
446; SI-NEXT:    s_mov_b32 s11, s3
447; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
448; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
449; SI-NEXT:    s_mov_b32 s0, s4
450; SI-NEXT:    s_mov_b32 s1, s5
451; SI-NEXT:    s_waitcnt vmcnt(0)
452; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
453; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
454; SI-NEXT:    s_endpgm
455;
456; VI-LABEL: v_trunc_i64_mul_to_i32:
457; VI:       ; %bb.0: ; %entry
458; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
459; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
460; VI-NEXT:    s_mov_b32 s3, 0xf000
461; VI-NEXT:    s_mov_b32 s2, -1
462; VI-NEXT:    s_mov_b32 s14, s2
463; VI-NEXT:    s_waitcnt lgkmcnt(0)
464; VI-NEXT:    s_mov_b32 s12, s6
465; VI-NEXT:    s_mov_b32 s13, s7
466; VI-NEXT:    s_mov_b32 s15, s3
467; VI-NEXT:    s_mov_b32 s10, s2
468; VI-NEXT:    s_mov_b32 s11, s3
469; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
470; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
471; VI-NEXT:    s_mov_b32 s0, s4
472; VI-NEXT:    s_mov_b32 s1, s5
473; VI-NEXT:    s_waitcnt vmcnt(0)
474; VI-NEXT:    v_mul_lo_u32 v0, v1, v0
475; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
476; VI-NEXT:    s_endpgm
477;
478; GFX9-LABEL: v_trunc_i64_mul_to_i32:
479; GFX9:       ; %bb.0: ; %entry
480; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
481; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
482; GFX9-NEXT:    s_mov_b32 s3, 0xf000
483; GFX9-NEXT:    s_mov_b32 s2, -1
484; GFX9-NEXT:    s_mov_b32 s14, s2
485; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX9-NEXT:    s_mov_b32 s12, s6
487; GFX9-NEXT:    s_mov_b32 s13, s7
488; GFX9-NEXT:    s_mov_b32 s15, s3
489; GFX9-NEXT:    s_mov_b32 s10, s2
490; GFX9-NEXT:    s_mov_b32 s11, s3
491; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
492; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
493; GFX9-NEXT:    s_mov_b32 s0, s4
494; GFX9-NEXT:    s_mov_b32 s1, s5
495; GFX9-NEXT:    s_waitcnt vmcnt(0)
496; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
497; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
498; GFX9-NEXT:    s_endpgm
499;
500; GFX10-LABEL: v_trunc_i64_mul_to_i32:
501; GFX10:       ; %bb.0: ; %entry
502; GFX10-NEXT:    s_clause 0x1
503; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
504; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
505; GFX10-NEXT:    s_mov_b32 s2, -1
506; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
507; GFX10-NEXT:    s_mov_b32 s14, s2
508; GFX10-NEXT:    s_mov_b32 s15, s3
509; GFX10-NEXT:    s_mov_b32 s10, s2
510; GFX10-NEXT:    s_mov_b32 s11, s3
511; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX10-NEXT:    s_mov_b32 s12, s6
513; GFX10-NEXT:    s_mov_b32 s13, s7
514; GFX10-NEXT:    buffer_load_dword v0, off, s[12:15], 0
515; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
516; GFX10-NEXT:    s_mov_b32 s0, s4
517; GFX10-NEXT:    s_mov_b32 s1, s5
518; GFX10-NEXT:    s_waitcnt vmcnt(0)
519; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
520; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
521; GFX10-NEXT:    s_endpgm
522;
523; GFX11-LABEL: v_trunc_i64_mul_to_i32:
524; GFX11:       ; %bb.0: ; %entry
525; GFX11-NEXT:    s_clause 0x1
526; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
527; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
528; GFX11-NEXT:    s_mov_b32 s10, -1
529; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
530; GFX11-NEXT:    s_mov_b32 s14, s10
531; GFX11-NEXT:    s_mov_b32 s15, s11
532; GFX11-NEXT:    s_mov_b32 s2, s10
533; GFX11-NEXT:    s_mov_b32 s3, s11
534; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX11-NEXT:    s_mov_b32 s12, s6
536; GFX11-NEXT:    s_mov_b32 s13, s7
537; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
538; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
539; GFX11-NEXT:    s_mov_b32 s8, s4
540; GFX11-NEXT:    s_mov_b32 s9, s5
541; GFX11-NEXT:    s_waitcnt vmcnt(0)
542; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v0
543; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
544; GFX11-NEXT:    s_nop 0
545; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
546; GFX11-NEXT:    s_endpgm
547;
548; GFX12-LABEL: v_trunc_i64_mul_to_i32:
549; GFX12:       ; %bb.0: ; %entry
550; GFX12-NEXT:    s_clause 0x1
551; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
552; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
553; GFX12-NEXT:    s_mov_b32 s10, -1
554; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
555; GFX12-NEXT:    s_mov_b32 s14, s10
556; GFX12-NEXT:    s_mov_b32 s15, s11
557; GFX12-NEXT:    s_mov_b32 s2, s10
558; GFX12-NEXT:    s_mov_b32 s3, s11
559; GFX12-NEXT:    s_wait_kmcnt 0x0
560; GFX12-NEXT:    s_mov_b32 s12, s6
561; GFX12-NEXT:    s_mov_b32 s13, s7
562; GFX12-NEXT:    buffer_load_b32 v0, off, s[12:15], null
563; GFX12-NEXT:    buffer_load_b32 v1, off, s[0:3], null
564; GFX12-NEXT:    s_mov_b32 s8, s4
565; GFX12-NEXT:    s_mov_b32 s9, s5
566; GFX12-NEXT:    s_wait_loadcnt 0x0
567; GFX12-NEXT:    v_mul_lo_u32 v0, v1, v0
568; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
569; GFX12-NEXT:    s_nop 0
570; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
571; GFX12-NEXT:    s_endpgm
572;
573; EG-LABEL: v_trunc_i64_mul_to_i32:
574; EG:       ; %bb.0: ; %entry
575; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
576; EG-NEXT:    TEX 1 @6
577; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
578; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
579; EG-NEXT:    CF_END
580; EG-NEXT:    PAD
581; EG-NEXT:    Fetch clause starting at 6:
582; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
583; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
584; EG-NEXT:    ALU clause starting at 10:
585; EG-NEXT:     MOV T0.X, KC0[2].Z,
586; EG-NEXT:     MOV * T1.X, KC0[2].W,
587; EG-NEXT:    ALU clause starting at 12:
588; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
589; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
590; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
591entry:
; Both operands are i64 in the IR; only the low 32 bits of the product are stored.
592  %a = load i64, ptr addrspace(1) %aptr, align 8
593  %b = load i64, ptr addrspace(1) %bptr, align 8
594  %mul = mul i64 %b, %a
595  %trunc = trunc i64 %mul to i32
596  store i32 %trunc, ptr addrspace(1) %out, align 8
597  ret void
598}
599
600; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
601; 32 bits of both arguments are sign bits.
602
; Multiplies a sign-extended i32 kernel argument by the constant 80 (0x50 in
; the checks) and stores the i64 product to %out.
; Expected lowering per target, as pinned by the autogenerated checks below:
;   verde   (SI prefix)    - v_mul_hi_i32 for the high word, s_mulk_i32 for the low word
;   tonga   (VI prefix)    - a single v_mad_i64_i32 with a zero addend
;   gfx900                 - s_mul_hi_i32 + s_mulk_i32
;   gfx1010, gfx1100       - s_mul_hi_i32 + s_mul_i32
;   gfx1200                - s_ashr_i32 by 31 to build the high word, then s_mul_u64
;   redwood (EG prefix)    - MULHI_INT + MULLO_INT
603define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
604; SI-LABEL: mul64_sext_c:
605; SI:       ; %bb.0: ; %entry
606; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
607; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
608; SI-NEXT:    v_mov_b32_e32 v0, 0x50
609; SI-NEXT:    s_mov_b32 s3, 0xf000
610; SI-NEXT:    s_mov_b32 s2, -1
611; SI-NEXT:    s_waitcnt lgkmcnt(0)
612; SI-NEXT:    v_mul_hi_i32 v1, s4, v0
613; SI-NEXT:    s_mulk_i32 s4, 0x50
614; SI-NEXT:    v_mov_b32_e32 v0, s4
615; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
616; SI-NEXT:    s_endpgm
617;
618; VI-LABEL: mul64_sext_c:
619; VI:       ; %bb.0: ; %entry
620; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
621; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
622; VI-NEXT:    v_mov_b32_e32 v0, 0x50
623; VI-NEXT:    s_waitcnt lgkmcnt(0)
624; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
625; VI-NEXT:    s_mov_b32 s3, 0xf000
626; VI-NEXT:    s_mov_b32 s2, -1
627; VI-NEXT:    s_nop 2
628; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
629; VI-NEXT:    s_endpgm
630;
631; GFX9-LABEL: mul64_sext_c:
632; GFX9:       ; %bb.0: ; %entry
633; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
634; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
635; GFX9-NEXT:    s_mov_b32 s7, 0xf000
636; GFX9-NEXT:    s_mov_b32 s6, -1
637; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX9-NEXT:    s_mul_hi_i32 s0, s2, 0x50
639; GFX9-NEXT:    s_mulk_i32 s2, 0x50
640; GFX9-NEXT:    v_mov_b32_e32 v0, s2
641; GFX9-NEXT:    v_mov_b32_e32 v1, s0
642; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
643; GFX9-NEXT:    s_endpgm
644;
645; GFX10-LABEL: mul64_sext_c:
646; GFX10:       ; %bb.0: ; %entry
647; GFX10-NEXT:    s_clause 0x1
648; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
649; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
650; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
651; GFX10-NEXT:    s_mov_b32 s6, -1
652; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX10-NEXT:    s_mul_i32 s0, s2, 0x50
654; GFX10-NEXT:    s_mul_hi_i32 s1, s2, 0x50
655; GFX10-NEXT:    v_mov_b32_e32 v0, s0
656; GFX10-NEXT:    v_mov_b32_e32 v1, s1
657; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
658; GFX10-NEXT:    s_endpgm
659;
660; GFX11-LABEL: mul64_sext_c:
661; GFX11:       ; %bb.0: ; %entry
662; GFX11-NEXT:    s_clause 0x1
663; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
664; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
665; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
666; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
667; GFX11-NEXT:    s_mul_hi_i32 s2, s2, 0x50
668; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
669; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
670; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
671; GFX11-NEXT:    s_mov_b32 s2, -1
672; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
673; GFX11-NEXT:    s_nop 0
674; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
675; GFX11-NEXT:    s_endpgm
676;
677; GFX12-LABEL: mul64_sext_c:
678; GFX12:       ; %bb.0: ; %entry
679; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
680; GFX12-NEXT:    s_wait_kmcnt 0x0
681; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
682; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
683; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
684; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
685; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
686; GFX12-NEXT:    s_mov_b32 s2, -1
687; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
688; GFX12-NEXT:    s_nop 0
689; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
690; GFX12-NEXT:    s_endpgm
691;
692; EG-LABEL: mul64_sext_c:
693; EG:       ; %bb.0: ; %entry
694; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
695; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
696; EG-NEXT:    CF_END
697; EG-NEXT:    PAD
698; EG-NEXT:    ALU clause starting at 4:
699; EG-NEXT:     MULHI_INT * T0.Y, KC0[2].Z, literal.x,
700; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
701; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
702; EG-NEXT:     MULLO_INT * T0.X, KC0[2].Z, literal.y,
703; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
704entry:
; The sext makes the upper 32 bits of the i64 operand copies of the sign bit of %in.
705  %0 = sext i32 %in to i64
706  %1 = mul i64 %0, 80
707  store i64 %1, ptr addrspace(1) %out
708  ret void
709}
710
; Multiplies a zero-extended i32 kernel argument by the constant 80 (0x50 in
; the checks) and stores the i64 product to %out.
; Expected lowering per target, as pinned by the autogenerated checks below:
;   verde   (SI prefix)    - v_mul_hi_u32 for the high word, s_mulk_i32 for the low word
;   tonga   (VI prefix)    - a single v_mad_u64_u32 with a zero addend
;   gfx900                 - s_mul_hi_u32 + s_mulk_i32
;   gfx1010, gfx1100       - s_mul_hi_u32 + s_mul_i32
;   gfx1200                - s_mov_b32 of 0 to build the high word, then s_mul_u64
;   redwood (EG prefix)    - MULHI + MULLO_INT
711define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
712; SI-LABEL: mul64_zext_c:
713; SI:       ; %bb.0: ; %entry
714; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
715; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
716; SI-NEXT:    v_mov_b32_e32 v0, 0x50
717; SI-NEXT:    s_mov_b32 s3, 0xf000
718; SI-NEXT:    s_mov_b32 s2, -1
719; SI-NEXT:    s_waitcnt lgkmcnt(0)
720; SI-NEXT:    v_mul_hi_u32 v1, s4, v0
721; SI-NEXT:    s_mulk_i32 s4, 0x50
722; SI-NEXT:    v_mov_b32_e32 v0, s4
723; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
724; SI-NEXT:    s_endpgm
725;
726; VI-LABEL: mul64_zext_c:
727; VI:       ; %bb.0: ; %entry
728; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
729; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
730; VI-NEXT:    v_mov_b32_e32 v0, 0x50
731; VI-NEXT:    s_waitcnt lgkmcnt(0)
732; VI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0
733; VI-NEXT:    s_mov_b32 s3, 0xf000
734; VI-NEXT:    s_mov_b32 s2, -1
735; VI-NEXT:    s_nop 2
736; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
737; VI-NEXT:    s_endpgm
738;
739; GFX9-LABEL: mul64_zext_c:
740; GFX9:       ; %bb.0: ; %entry
741; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
742; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
743; GFX9-NEXT:    s_mov_b32 s7, 0xf000
744; GFX9-NEXT:    s_mov_b32 s6, -1
745; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
746; GFX9-NEXT:    s_mul_hi_u32 s0, s2, 0x50
747; GFX9-NEXT:    s_mulk_i32 s2, 0x50
748; GFX9-NEXT:    v_mov_b32_e32 v0, s2
749; GFX9-NEXT:    v_mov_b32_e32 v1, s0
750; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
751; GFX9-NEXT:    s_endpgm
752;
753; GFX10-LABEL: mul64_zext_c:
754; GFX10:       ; %bb.0: ; %entry
755; GFX10-NEXT:    s_clause 0x1
756; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
757; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
758; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
759; GFX10-NEXT:    s_mov_b32 s6, -1
760; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
761; GFX10-NEXT:    s_mul_i32 s0, s2, 0x50
762; GFX10-NEXT:    s_mul_hi_u32 s1, s2, 0x50
763; GFX10-NEXT:    v_mov_b32_e32 v0, s0
764; GFX10-NEXT:    v_mov_b32_e32 v1, s1
765; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
766; GFX10-NEXT:    s_endpgm
767;
768; GFX11-LABEL: mul64_zext_c:
769; GFX11:       ; %bb.0: ; %entry
770; GFX11-NEXT:    s_clause 0x1
771; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
772; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
773; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
775; GFX11-NEXT:    s_mul_hi_u32 s2, s2, 0x50
776; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
777; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
778; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
779; GFX11-NEXT:    s_mov_b32 s2, -1
780; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
781; GFX11-NEXT:    s_nop 0
782; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
783; GFX11-NEXT:    s_endpgm
784;
785; GFX12-LABEL: mul64_zext_c:
786; GFX12:       ; %bb.0: ; %entry
787; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
788; GFX12-NEXT:    s_mov_b32 s3, 0
789; GFX12-NEXT:    s_wait_kmcnt 0x0
790; GFX12-NEXT:    s_mul_u64 s[4:5], s[2:3], 0x50
791; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
792; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
793; GFX12-NEXT:    s_mov_b32 s2, -1
794; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
795; GFX12-NEXT:    s_nop 0
796; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
797; GFX12-NEXT:    s_endpgm
798;
799; EG-LABEL: mul64_zext_c:
800; EG:       ; %bb.0: ; %entry
801; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
802; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
803; EG-NEXT:    CF_END
804; EG-NEXT:    PAD
805; EG-NEXT:    ALU clause starting at 4:
806; EG-NEXT:     MULHI * T0.Y, KC0[2].Z, literal.x,
807; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
808; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
809; EG-NEXT:     MULLO_INT * T0.X, KC0[2].Z, literal.y,
810; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
811entry:
; The zext makes the upper 32 bits of the i64 operand zero.
812  %0 = zext i32 %in to i64
813  %1 = mul i64 %0, 80
814  store i64 %1, ptr addrspace(1) %out
815  ret void
816}
817
; Multiply a sign-extended 32-bit value loaded from global memory by the
; constant 80 and store the 64-bit product.
;
; The expected output below forms the product either from a signed high-half
; multiply plus a low-half multiply (v_mul_hi_i32 + v_mul_lo_u32 on amdgcn,
; MULHI_INT + MULLO_INT on r600) or, for tonga, from a single v_mad_i64_i32
; with a zero addend. 80 (0x50) is first moved into an SGPR with s_movk_i32
; for verde, tonga and gfx900, and is passed as a literal operand for gfx1010
; and newer.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
; NOTE(review): the "v_" name prefix presumably marks the variant whose operand
; comes from a memory load rather than a kernel argument - confirm.
818define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
819; SI-LABEL: v_mul64_sext_c:
820; SI:       ; %bb.0: ; %entry
821; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
822; SI-NEXT:    s_mov_b32 s7, 0xf000
823; SI-NEXT:    s_mov_b32 s6, -1
824; SI-NEXT:    s_mov_b32 s10, s6
825; SI-NEXT:    s_mov_b32 s11, s7
826; SI-NEXT:    s_waitcnt lgkmcnt(0)
827; SI-NEXT:    s_mov_b32 s8, s2
828; SI-NEXT:    s_mov_b32 s9, s3
829; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
830; SI-NEXT:    s_movk_i32 s2, 0x50
831; SI-NEXT:    s_mov_b32 s4, s0
832; SI-NEXT:    s_mov_b32 s5, s1
833; SI-NEXT:    s_waitcnt vmcnt(0)
834; SI-NEXT:    v_mul_hi_i32 v1, v0, s2
835; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
836; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
837; SI-NEXT:    s_endpgm
838;
839; VI-LABEL: v_mul64_sext_c:
840; VI:       ; %bb.0: ; %entry
841; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
842; VI-NEXT:    s_mov_b32 s7, 0xf000
843; VI-NEXT:    s_mov_b32 s6, -1
844; VI-NEXT:    s_mov_b32 s10, s6
845; VI-NEXT:    s_mov_b32 s11, s7
846; VI-NEXT:    s_waitcnt lgkmcnt(0)
847; VI-NEXT:    s_mov_b32 s8, s2
848; VI-NEXT:    s_mov_b32 s9, s3
849; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
850; VI-NEXT:    s_movk_i32 s2, 0x50
851; VI-NEXT:    s_mov_b32 s4, s0
852; VI-NEXT:    s_mov_b32 s5, s1
853; VI-NEXT:    s_waitcnt vmcnt(0)
854; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
855; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
856; VI-NEXT:    s_endpgm
857;
858; GFX9-LABEL: v_mul64_sext_c:
859; GFX9:       ; %bb.0: ; %entry
860; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
861; GFX9-NEXT:    s_mov_b32 s7, 0xf000
862; GFX9-NEXT:    s_mov_b32 s6, -1
863; GFX9-NEXT:    s_mov_b32 s10, s6
864; GFX9-NEXT:    s_mov_b32 s11, s7
865; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
866; GFX9-NEXT:    s_mov_b32 s8, s2
867; GFX9-NEXT:    s_mov_b32 s9, s3
868; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
869; GFX9-NEXT:    s_movk_i32 s2, 0x50
870; GFX9-NEXT:    s_mov_b32 s4, s0
871; GFX9-NEXT:    s_mov_b32 s5, s1
872; GFX9-NEXT:    s_waitcnt vmcnt(0)
873; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
874; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
875; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
876; GFX9-NEXT:    s_endpgm
877;
878; GFX10-LABEL: v_mul64_sext_c:
879; GFX10:       ; %bb.0: ; %entry
880; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
881; GFX10-NEXT:    s_mov_b32 s6, -1
882; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
883; GFX10-NEXT:    s_mov_b32 s10, s6
884; GFX10-NEXT:    s_mov_b32 s11, s7
885; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX10-NEXT:    s_mov_b32 s8, s2
887; GFX10-NEXT:    s_mov_b32 s9, s3
888; GFX10-NEXT:    s_mov_b32 s4, s0
889; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
890; GFX10-NEXT:    s_mov_b32 s5, s1
891; GFX10-NEXT:    s_waitcnt vmcnt(0)
892; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
893; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
894; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
895; GFX10-NEXT:    s_endpgm
896;
897; GFX11-LABEL: v_mul64_sext_c:
898; GFX11:       ; %bb.0: ; %entry
899; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
900; GFX11-NEXT:    s_mov_b32 s6, -1
901; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
902; GFX11-NEXT:    s_mov_b32 s10, s6
903; GFX11-NEXT:    s_mov_b32 s11, s7
904; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX11-NEXT:    s_mov_b32 s8, s2
906; GFX11-NEXT:    s_mov_b32 s9, s3
907; GFX11-NEXT:    s_mov_b32 s4, s0
908; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
909; GFX11-NEXT:    s_mov_b32 s5, s1
910; GFX11-NEXT:    s_waitcnt vmcnt(0)
911; GFX11-NEXT:    v_mul_hi_i32 v1, 0x50, v0
912; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
913; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
914; GFX11-NEXT:    s_nop 0
915; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
916; GFX11-NEXT:    s_endpgm
917;
918; GFX12-LABEL: v_mul64_sext_c:
919; GFX12:       ; %bb.0: ; %entry
920; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
921; GFX12-NEXT:    s_mov_b32 s6, -1
922; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
923; GFX12-NEXT:    s_mov_b32 s10, s6
924; GFX12-NEXT:    s_mov_b32 s11, s7
925; GFX12-NEXT:    s_wait_kmcnt 0x0
926; GFX12-NEXT:    s_mov_b32 s8, s2
927; GFX12-NEXT:    s_mov_b32 s9, s3
928; GFX12-NEXT:    s_mov_b32 s4, s0
929; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
930; GFX12-NEXT:    s_mov_b32 s5, s1
931; GFX12-NEXT:    s_wait_loadcnt 0x0
932; GFX12-NEXT:    v_mul_hi_i32 v1, 0x50, v0
933; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
934; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
935; GFX12-NEXT:    s_nop 0
936; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
937; GFX12-NEXT:    s_endpgm
938;
939; EG-LABEL: v_mul64_sext_c:
940; EG:       ; %bb.0: ; %entry
941; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
942; EG-NEXT:    TEX 0 @6
943; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
944; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
945; EG-NEXT:    CF_END
946; EG-NEXT:    PAD
947; EG-NEXT:    Fetch clause starting at 6:
948; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
949; EG-NEXT:    ALU clause starting at 8:
950; EG-NEXT:     MOV * T0.X, KC0[2].Z,
951; EG-NEXT:    ALU clause starting at 9:
952; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
953; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
954; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
955; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
956; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
; IR under test: the loaded i32 is sign-extended before the multiply, so the
; stored product is a full 64-bit value.
957entry:
958  %val = load i32, ptr addrspace(1) %in, align 4
959  %ext = sext i32 %val to i64
960  %mul = mul i64 %ext, 80
961  store i64 %mul, ptr addrspace(1) %out, align 8
962  ret void
963}
964
; Multiply a zero-extended 32-bit value loaded from global memory by the
; constant 80 and store the 64-bit product.
;
; The expected output below forms the product either from an unsigned
; high-half multiply plus a low-half multiply (v_mul_hi_u32 + v_mul_lo_u32 on
; amdgcn, MULHI + MULLO_INT on r600) or, for tonga, from a single
; v_mad_u64_u32 with a zero addend. 80 (0x50) is first moved into an SGPR with
; s_movk_i32 for verde, tonga and gfx900, and is passed as a literal operand
; for gfx1010 and newer.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
965define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
966; SI-LABEL: v_mul64_zext_c:
967; SI:       ; %bb.0: ; %entry
968; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
969; SI-NEXT:    s_mov_b32 s7, 0xf000
970; SI-NEXT:    s_mov_b32 s6, -1
971; SI-NEXT:    s_mov_b32 s10, s6
972; SI-NEXT:    s_mov_b32 s11, s7
973; SI-NEXT:    s_waitcnt lgkmcnt(0)
974; SI-NEXT:    s_mov_b32 s8, s2
975; SI-NEXT:    s_mov_b32 s9, s3
976; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
977; SI-NEXT:    s_movk_i32 s2, 0x50
978; SI-NEXT:    s_mov_b32 s4, s0
979; SI-NEXT:    s_mov_b32 s5, s1
980; SI-NEXT:    s_waitcnt vmcnt(0)
981; SI-NEXT:    v_mul_hi_u32 v1, v0, s2
982; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
983; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
984; SI-NEXT:    s_endpgm
985;
986; VI-LABEL: v_mul64_zext_c:
987; VI:       ; %bb.0: ; %entry
988; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
989; VI-NEXT:    s_mov_b32 s7, 0xf000
990; VI-NEXT:    s_mov_b32 s6, -1
991; VI-NEXT:    s_mov_b32 s10, s6
992; VI-NEXT:    s_mov_b32 s11, s7
993; VI-NEXT:    s_waitcnt lgkmcnt(0)
994; VI-NEXT:    s_mov_b32 s8, s2
995; VI-NEXT:    s_mov_b32 s9, s3
996; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
997; VI-NEXT:    s_movk_i32 s2, 0x50
998; VI-NEXT:    s_mov_b32 s4, s0
999; VI-NEXT:    s_mov_b32 s5, s1
1000; VI-NEXT:    s_waitcnt vmcnt(0)
1001; VI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
1002; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1003; VI-NEXT:    s_endpgm
1004;
1005; GFX9-LABEL: v_mul64_zext_c:
1006; GFX9:       ; %bb.0: ; %entry
1007; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1008; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1009; GFX9-NEXT:    s_mov_b32 s6, -1
1010; GFX9-NEXT:    s_mov_b32 s10, s6
1011; GFX9-NEXT:    s_mov_b32 s11, s7
1012; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1013; GFX9-NEXT:    s_mov_b32 s8, s2
1014; GFX9-NEXT:    s_mov_b32 s9, s3
1015; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1016; GFX9-NEXT:    s_movk_i32 s2, 0x50
1017; GFX9-NEXT:    s_mov_b32 s4, s0
1018; GFX9-NEXT:    s_mov_b32 s5, s1
1019; GFX9-NEXT:    s_waitcnt vmcnt(0)
1020; GFX9-NEXT:    v_mul_hi_u32 v1, v0, s2
1021; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
1022; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1023; GFX9-NEXT:    s_endpgm
1024;
1025; GFX10-LABEL: v_mul64_zext_c:
1026; GFX10:       ; %bb.0: ; %entry
1027; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1028; GFX10-NEXT:    s_mov_b32 s6, -1
1029; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1030; GFX10-NEXT:    s_mov_b32 s10, s6
1031; GFX10-NEXT:    s_mov_b32 s11, s7
1032; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1033; GFX10-NEXT:    s_mov_b32 s8, s2
1034; GFX10-NEXT:    s_mov_b32 s9, s3
1035; GFX10-NEXT:    s_mov_b32 s4, s0
1036; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1037; GFX10-NEXT:    s_mov_b32 s5, s1
1038; GFX10-NEXT:    s_waitcnt vmcnt(0)
1039; GFX10-NEXT:    v_mul_hi_u32 v1, 0x50, v0
1040; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
1041; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1042; GFX10-NEXT:    s_endpgm
1043;
1044; GFX11-LABEL: v_mul64_zext_c:
1045; GFX11:       ; %bb.0: ; %entry
1046; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1047; GFX11-NEXT:    s_mov_b32 s6, -1
1048; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1049; GFX11-NEXT:    s_mov_b32 s10, s6
1050; GFX11-NEXT:    s_mov_b32 s11, s7
1051; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX11-NEXT:    s_mov_b32 s8, s2
1053; GFX11-NEXT:    s_mov_b32 s9, s3
1054; GFX11-NEXT:    s_mov_b32 s4, s0
1055; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
1056; GFX11-NEXT:    s_mov_b32 s5, s1
1057; GFX11-NEXT:    s_waitcnt vmcnt(0)
1058; GFX11-NEXT:    v_mul_hi_u32 v1, 0x50, v0
1059; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
1060; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1061; GFX11-NEXT:    s_nop 0
1062; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1063; GFX11-NEXT:    s_endpgm
1064;
1065; GFX12-LABEL: v_mul64_zext_c:
1066; GFX12:       ; %bb.0: ; %entry
1067; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1068; GFX12-NEXT:    s_mov_b32 s6, -1
1069; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1070; GFX12-NEXT:    s_mov_b32 s10, s6
1071; GFX12-NEXT:    s_mov_b32 s11, s7
1072; GFX12-NEXT:    s_wait_kmcnt 0x0
1073; GFX12-NEXT:    s_mov_b32 s8, s2
1074; GFX12-NEXT:    s_mov_b32 s9, s3
1075; GFX12-NEXT:    s_mov_b32 s4, s0
1076; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
1077; GFX12-NEXT:    s_mov_b32 s5, s1
1078; GFX12-NEXT:    s_wait_loadcnt 0x0
1079; GFX12-NEXT:    v_mul_hi_u32 v1, 0x50, v0
1080; GFX12-NEXT:    v_mul_lo_u32 v0, 0x50, v0
1081; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
1082; GFX12-NEXT:    s_nop 0
1083; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1084; GFX12-NEXT:    s_endpgm
1085;
1086; EG-LABEL: v_mul64_zext_c:
1087; EG:       ; %bb.0: ; %entry
1088; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1089; EG-NEXT:    TEX 0 @6
1090; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1091; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1092; EG-NEXT:    CF_END
1093; EG-NEXT:    PAD
1094; EG-NEXT:    Fetch clause starting at 6:
1095; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1096; EG-NEXT:    ALU clause starting at 8:
1097; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1098; EG-NEXT:    ALU clause starting at 9:
1099; EG-NEXT:     MULHI * T0.Y, T0.X, literal.x,
1100; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
1101; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1102; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
1103; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
; IR under test: the loaded i32 is zero-extended before the multiply, so the
; stored product is a full 64-bit value.
1104entry:
1105  %val = load i32, ptr addrspace(1) %in, align 4
1106  %ext = zext i32 %val to i64
1107  %mul = mul i64 %ext, 80
1108  store i64 %mul, ptr addrspace(1) %out, align 8
1109  ret void
1110}
1111
; Multiply a sign-extended 32-bit value loaded from global memory by the
; constant 9 and store the 64-bit product.
;
; In the expected amdgcn output the constant 9 appears directly as an operand
; of the multiply instructions (v_mul_hi_i32 + v_mul_lo_u32, or v_mad_i64_i32
; for tonga); no separate scalar move is emitted for it. For gfx1200 the
; expected multiplies list 9 as the first source operand instead of the
; second. The r600 output takes 9 from literal slots.
; NOTE(review): going by the test name, 9 was presumably chosen because it
; fits the hardware inline-constant encoding - confirm against the ISA docs.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1112define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1113; SI-LABEL: v_mul64_sext_inline_imm:
1114; SI:       ; %bb.0: ; %entry
1115; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1116; SI-NEXT:    s_mov_b32 s7, 0xf000
1117; SI-NEXT:    s_mov_b32 s6, -1
1118; SI-NEXT:    s_mov_b32 s10, s6
1119; SI-NEXT:    s_mov_b32 s11, s7
1120; SI-NEXT:    s_waitcnt lgkmcnt(0)
1121; SI-NEXT:    s_mov_b32 s8, s2
1122; SI-NEXT:    s_mov_b32 s9, s3
1123; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1124; SI-NEXT:    s_mov_b32 s4, s0
1125; SI-NEXT:    s_mov_b32 s5, s1
1126; SI-NEXT:    s_waitcnt vmcnt(0)
1127; SI-NEXT:    v_mul_hi_i32 v1, v0, 9
1128; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
1129; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1130; SI-NEXT:    s_endpgm
1131;
1132; VI-LABEL: v_mul64_sext_inline_imm:
1133; VI:       ; %bb.0: ; %entry
1134; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1135; VI-NEXT:    s_mov_b32 s7, 0xf000
1136; VI-NEXT:    s_mov_b32 s6, -1
1137; VI-NEXT:    s_mov_b32 s10, s6
1138; VI-NEXT:    s_mov_b32 s11, s7
1139; VI-NEXT:    s_waitcnt lgkmcnt(0)
1140; VI-NEXT:    s_mov_b32 s8, s2
1141; VI-NEXT:    s_mov_b32 s9, s3
1142; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1143; VI-NEXT:    s_mov_b32 s4, s0
1144; VI-NEXT:    s_mov_b32 s5, s1
1145; VI-NEXT:    s_waitcnt vmcnt(0)
1146; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
1147; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1148; VI-NEXT:    s_endpgm
1149;
1150; GFX9-LABEL: v_mul64_sext_inline_imm:
1151; GFX9:       ; %bb.0: ; %entry
1152; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1153; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1154; GFX9-NEXT:    s_mov_b32 s6, -1
1155; GFX9-NEXT:    s_mov_b32 s10, s6
1156; GFX9-NEXT:    s_mov_b32 s11, s7
1157; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX9-NEXT:    s_mov_b32 s8, s2
1159; GFX9-NEXT:    s_mov_b32 s9, s3
1160; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1161; GFX9-NEXT:    s_mov_b32 s4, s0
1162; GFX9-NEXT:    s_mov_b32 s5, s1
1163; GFX9-NEXT:    s_waitcnt vmcnt(0)
1164; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
1165; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
1166; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1167; GFX9-NEXT:    s_endpgm
1168;
1169; GFX10-LABEL: v_mul64_sext_inline_imm:
1170; GFX10:       ; %bb.0: ; %entry
1171; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1172; GFX10-NEXT:    s_mov_b32 s6, -1
1173; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1174; GFX10-NEXT:    s_mov_b32 s10, s6
1175; GFX10-NEXT:    s_mov_b32 s11, s7
1176; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1177; GFX10-NEXT:    s_mov_b32 s8, s2
1178; GFX10-NEXT:    s_mov_b32 s9, s3
1179; GFX10-NEXT:    s_mov_b32 s4, s0
1180; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1181; GFX10-NEXT:    s_mov_b32 s5, s1
1182; GFX10-NEXT:    s_waitcnt vmcnt(0)
1183; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
1184; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
1185; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1186; GFX10-NEXT:    s_endpgm
1187;
1188; GFX11-LABEL: v_mul64_sext_inline_imm:
1189; GFX11:       ; %bb.0: ; %entry
1190; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1191; GFX11-NEXT:    s_mov_b32 s6, -1
1192; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1193; GFX11-NEXT:    s_mov_b32 s10, s6
1194; GFX11-NEXT:    s_mov_b32 s11, s7
1195; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1196; GFX11-NEXT:    s_mov_b32 s8, s2
1197; GFX11-NEXT:    s_mov_b32 s9, s3
1198; GFX11-NEXT:    s_mov_b32 s4, s0
1199; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
1200; GFX11-NEXT:    s_mov_b32 s5, s1
1201; GFX11-NEXT:    s_waitcnt vmcnt(0)
1202; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
1203; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
1204; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1205; GFX11-NEXT:    s_nop 0
1206; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1207; GFX11-NEXT:    s_endpgm
1208;
1209; GFX12-LABEL: v_mul64_sext_inline_imm:
1210; GFX12:       ; %bb.0: ; %entry
1211; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1212; GFX12-NEXT:    s_mov_b32 s6, -1
1213; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1214; GFX12-NEXT:    s_mov_b32 s10, s6
1215; GFX12-NEXT:    s_mov_b32 s11, s7
1216; GFX12-NEXT:    s_wait_kmcnt 0x0
1217; GFX12-NEXT:    s_mov_b32 s8, s2
1218; GFX12-NEXT:    s_mov_b32 s9, s3
1219; GFX12-NEXT:    s_mov_b32 s4, s0
1220; GFX12-NEXT:    buffer_load_b32 v0, off, s[8:11], null
1221; GFX12-NEXT:    s_mov_b32 s5, s1
1222; GFX12-NEXT:    s_wait_loadcnt 0x0
1223; GFX12-NEXT:    v_mul_hi_i32 v1, 9, v0
1224; GFX12-NEXT:    v_mul_lo_u32 v0, 9, v0
1225; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
1226; GFX12-NEXT:    s_nop 0
1227; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1228; GFX12-NEXT:    s_endpgm
1229;
1230; EG-LABEL: v_mul64_sext_inline_imm:
1231; EG:       ; %bb.0: ; %entry
1232; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1233; EG-NEXT:    TEX 0 @6
1234; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1235; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1236; EG-NEXT:    CF_END
1237; EG-NEXT:    PAD
1238; EG-NEXT:    Fetch clause starting at 6:
1239; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1240; EG-NEXT:    ALU clause starting at 8:
1241; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1242; EG-NEXT:    ALU clause starting at 9:
1243; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
1244; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1245; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1246; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
1247; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
; IR under test: same shape as a sign-extended multiply by a constant, with 9
; as the multiplier.
1248entry:
1249  %val = load i32, ptr addrspace(1) %in, align 4
1250  %ext = sext i32 %val to i64
1251  %mul = mul i64 %ext, 9
1252  store i64 %mul, ptr addrspace(1) %out, align 8
1253  ret void
1254}
1255
; Multiply two i32 kernel arguments and store the 32-bit product.
;
; %a and %b are each preceded by an unnamed [8 x i32] argument, so they are
; loaded from separate kernel-argument offsets (see the distinct s_load
; offsets below). In the expected amdgcn output the multiply is a scalar
; s_mul_i32 and the result is copied into a VGPR only for the buffer store;
; the r600 output uses MULLO_INT directly on two KC0 operands.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1256define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
1257; SI-LABEL: s_mul_i32:
1258; SI:       ; %bb.0: ; %entry
1259; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
1260; SI-NEXT:    s_load_dword s5, s[0:1], 0x1c
1261; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1262; SI-NEXT:    s_mov_b32 s3, 0xf000
1263; SI-NEXT:    s_mov_b32 s2, -1
1264; SI-NEXT:    s_waitcnt lgkmcnt(0)
1265; SI-NEXT:    s_mul_i32 s4, s4, s5
1266; SI-NEXT:    v_mov_b32_e32 v0, s4
1267; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1268; SI-NEXT:    s_endpgm
1269;
1270; VI-LABEL: s_mul_i32:
1271; VI:       ; %bb.0: ; %entry
1272; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
1273; VI-NEXT:    s_load_dword s5, s[0:1], 0x70
1274; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1275; VI-NEXT:    s_mov_b32 s3, 0xf000
1276; VI-NEXT:    s_mov_b32 s2, -1
1277; VI-NEXT:    s_waitcnt lgkmcnt(0)
1278; VI-NEXT:    s_mul_i32 s4, s4, s5
1279; VI-NEXT:    v_mov_b32_e32 v0, s4
1280; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1281; VI-NEXT:    s_endpgm
1282;
1283; GFX9-LABEL: s_mul_i32:
1284; GFX9:       ; %bb.0: ; %entry
1285; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x4c
1286; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x70
1287; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1288; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1289; GFX9-NEXT:    s_mov_b32 s6, -1
1290; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1291; GFX9-NEXT:    s_mul_i32 s0, s2, s3
1292; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1293; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1294; GFX9-NEXT:    s_endpgm
1295;
1296; GFX10-LABEL: s_mul_i32:
1297; GFX10:       ; %bb.0: ; %entry
1298; GFX10-NEXT:    s_clause 0x2
1299; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
1300; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
1301; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1302; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1303; GFX10-NEXT:    s_mov_b32 s6, -1
1304; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1305; GFX10-NEXT:    s_mul_i32 s0, s2, s3
1306; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1307; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1308; GFX10-NEXT:    s_endpgm
1309;
1310; GFX11-LABEL: s_mul_i32:
1311; GFX11:       ; %bb.0: ; %entry
1312; GFX11-NEXT:    s_clause 0x2
1313; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x4c
1314; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x70
1315; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1316; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX11-NEXT:    s_mul_i32 s2, s2, s3
1318; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1319; GFX11-NEXT:    v_mov_b32_e32 v0, s2
1320; GFX11-NEXT:    s_mov_b32 s2, -1
1321; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1322; GFX11-NEXT:    s_nop 0
1323; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1324; GFX11-NEXT:    s_endpgm
1325;
1326; GFX12-LABEL: s_mul_i32:
1327; GFX12:       ; %bb.0: ; %entry
1328; GFX12-NEXT:    s_clause 0x2
1329; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x4c
1330; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x70
1331; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1332; GFX12-NEXT:    s_wait_kmcnt 0x0
1333; GFX12-NEXT:    s_mul_i32 s2, s2, s3
1334; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
1335; GFX12-NEXT:    v_mov_b32_e32 v0, s2
1336; GFX12-NEXT:    s_mov_b32 s2, -1
1337; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
1338; GFX12-NEXT:    s_nop 0
1339; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1340; GFX12-NEXT:    s_endpgm
1341;
1342; EG-LABEL: s_mul_i32:
1343; EG:       ; %bb.0: ; %entry
1344; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
1345; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1346; EG-NEXT:    CF_END
1347; EG-NEXT:    PAD
1348; EG-NEXT:    ALU clause starting at 4:
1349; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1350; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1351; EG-NEXT:     MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
; IR under test: a plain 32-bit multiply of the two kernel arguments.
1352entry:
1353  %mul = mul i32 %a, %b
1354  store i32 %mul, ptr addrspace(1) %out, align 4
1355  ret void
1356}
1357
; Multiply two adjacent i32 values loaded from global memory and store the
; 32-bit product.
;
; The operands are elements 0 and 1 of %in. In the expected output both are
; fetched with one 64-bit load (buffer_load_dwordx2 / buffer_load_b64 on
; amdgcn, VTX_READ_64 on r600) and multiplied with v_mul_lo_u32 (MULLO_INT on
; r600).
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1358define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1359; SI-LABEL: v_mul_i32:
1360; SI:       ; %bb.0: ; %entry
1361; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1362; SI-NEXT:    s_mov_b32 s7, 0xf000
1363; SI-NEXT:    s_mov_b32 s6, -1
1364; SI-NEXT:    s_mov_b32 s10, s6
1365; SI-NEXT:    s_mov_b32 s11, s7
1366; SI-NEXT:    s_waitcnt lgkmcnt(0)
1367; SI-NEXT:    s_mov_b32 s8, s2
1368; SI-NEXT:    s_mov_b32 s9, s3
1369; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1370; SI-NEXT:    s_mov_b32 s4, s0
1371; SI-NEXT:    s_mov_b32 s5, s1
1372; SI-NEXT:    s_waitcnt vmcnt(0)
1373; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
1374; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1375; SI-NEXT:    s_endpgm
1376;
1377; VI-LABEL: v_mul_i32:
1378; VI:       ; %bb.0: ; %entry
1379; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1380; VI-NEXT:    s_mov_b32 s7, 0xf000
1381; VI-NEXT:    s_mov_b32 s6, -1
1382; VI-NEXT:    s_mov_b32 s10, s6
1383; VI-NEXT:    s_mov_b32 s11, s7
1384; VI-NEXT:    s_waitcnt lgkmcnt(0)
1385; VI-NEXT:    s_mov_b32 s8, s2
1386; VI-NEXT:    s_mov_b32 s9, s3
1387; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1388; VI-NEXT:    s_mov_b32 s4, s0
1389; VI-NEXT:    s_mov_b32 s5, s1
1390; VI-NEXT:    s_waitcnt vmcnt(0)
1391; VI-NEXT:    v_mul_lo_u32 v0, v0, v1
1392; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1393; VI-NEXT:    s_endpgm
1394;
1395; GFX9-LABEL: v_mul_i32:
1396; GFX9:       ; %bb.0: ; %entry
1397; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1398; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1399; GFX9-NEXT:    s_mov_b32 s6, -1
1400; GFX9-NEXT:    s_mov_b32 s10, s6
1401; GFX9-NEXT:    s_mov_b32 s11, s7
1402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX9-NEXT:    s_mov_b32 s8, s2
1404; GFX9-NEXT:    s_mov_b32 s9, s3
1405; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1406; GFX9-NEXT:    s_mov_b32 s4, s0
1407; GFX9-NEXT:    s_mov_b32 s5, s1
1408; GFX9-NEXT:    s_waitcnt vmcnt(0)
1409; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
1410; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1411; GFX9-NEXT:    s_endpgm
1412;
1413; GFX10-LABEL: v_mul_i32:
1414; GFX10:       ; %bb.0: ; %entry
1415; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1416; GFX10-NEXT:    s_mov_b32 s6, -1
1417; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1418; GFX10-NEXT:    s_mov_b32 s10, s6
1419; GFX10-NEXT:    s_mov_b32 s11, s7
1420; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1421; GFX10-NEXT:    s_mov_b32 s8, s2
1422; GFX10-NEXT:    s_mov_b32 s9, s3
1423; GFX10-NEXT:    s_mov_b32 s4, s0
1424; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1425; GFX10-NEXT:    s_mov_b32 s5, s1
1426; GFX10-NEXT:    s_waitcnt vmcnt(0)
1427; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
1428; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1429; GFX10-NEXT:    s_endpgm
1430;
1431; GFX11-LABEL: v_mul_i32:
1432; GFX11:       ; %bb.0: ; %entry
1433; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1434; GFX11-NEXT:    s_mov_b32 s6, -1
1435; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1436; GFX11-NEXT:    s_mov_b32 s10, s6
1437; GFX11-NEXT:    s_mov_b32 s11, s7
1438; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1439; GFX11-NEXT:    s_mov_b32 s8, s2
1440; GFX11-NEXT:    s_mov_b32 s9, s3
1441; GFX11-NEXT:    s_mov_b32 s4, s0
1442; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
1443; GFX11-NEXT:    s_mov_b32 s5, s1
1444; GFX11-NEXT:    s_waitcnt vmcnt(0)
1445; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
1446; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
1447; GFX11-NEXT:    s_nop 0
1448; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1449; GFX11-NEXT:    s_endpgm
1450;
1451; GFX12-LABEL: v_mul_i32:
1452; GFX12:       ; %bb.0: ; %entry
1453; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1454; GFX12-NEXT:    s_mov_b32 s6, -1
1455; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1456; GFX12-NEXT:    s_mov_b32 s10, s6
1457; GFX12-NEXT:    s_mov_b32 s11, s7
1458; GFX12-NEXT:    s_wait_kmcnt 0x0
1459; GFX12-NEXT:    s_mov_b32 s8, s2
1460; GFX12-NEXT:    s_mov_b32 s9, s3
1461; GFX12-NEXT:    s_mov_b32 s4, s0
1462; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], null
1463; GFX12-NEXT:    s_mov_b32 s5, s1
1464; GFX12-NEXT:    s_wait_loadcnt 0x0
1465; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
1466; GFX12-NEXT:    buffer_store_b32 v0, off, s[4:7], null
1467; GFX12-NEXT:    s_nop 0
1468; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1469; GFX12-NEXT:    s_endpgm
1470;
1471; EG-LABEL: v_mul_i32:
1472; EG:       ; %bb.0: ; %entry
1473; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1474; EG-NEXT:    TEX 0 @6
1475; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1476; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1477; EG-NEXT:    CF_END
1478; EG-NEXT:    PAD
1479; EG-NEXT:    Fetch clause starting at 6:
1480; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1481; EG-NEXT:    ALU clause starting at 8:
1482; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1483; EG-NEXT:    ALU clause starting at 9:
1484; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1485; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Y,
1486; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: %b_ptr addresses the i32 immediately after the one at %in.
1487entry:
1488  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1489  %a = load i32, ptr addrspace(1) %in
1490  %b = load i32, ptr addrspace(1) %b_ptr
1491  %result = mul i32 %a, %b
1492  store i32 %result, ptr addrspace(1) %out
1493  ret void
1494}
1495
; Multiply two i1 kernel arguments and store the 1-bit product.
;
; %a and %b are each preceded by an unnamed [8 x i32] argument, so they are
; loaded from separate kernel-argument offsets. The expected amdgcn output
; performs the multiply at a wider width and then masks the result with 1
; before a single-byte store:
;   - verde uses a scalar s_mul_i32 followed by s_and_b32;
;   - tonga and gfx900 move one operand into a VGPR and use v_mul_lo_u16_e32;
;   - gfx1010 and newer use v_mul_lo_u16 with both operands in SGPRs.
; The r600 output reads both bytes with VTX_READ_8 and writes the result with
; a MEM_RAT MSKOR store.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
1496define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind {
1497; SI-LABEL: s_mul_i1:
1498; SI:       ; %bb.0: ; %entry
1499; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
1500; SI-NEXT:    s_load_dword s5, s[0:1], 0x1c
1501; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1502; SI-NEXT:    s_mov_b32 s3, 0xf000
1503; SI-NEXT:    s_mov_b32 s2, -1
1504; SI-NEXT:    s_waitcnt lgkmcnt(0)
1505; SI-NEXT:    s_mul_i32 s4, s4, s5
1506; SI-NEXT:    s_and_b32 s4, s4, 1
1507; SI-NEXT:    v_mov_b32_e32 v0, s4
1508; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1509; SI-NEXT:    s_endpgm
1510;
1511; VI-LABEL: s_mul_i1:
1512; VI:       ; %bb.0: ; %entry
1513; VI-NEXT:    s_load_dword s4, s[0:1], 0x70
1514; VI-NEXT:    s_load_dword s5, s[0:1], 0x4c
1515; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1516; VI-NEXT:    s_mov_b32 s3, 0xf000
1517; VI-NEXT:    s_mov_b32 s2, -1
1518; VI-NEXT:    s_waitcnt lgkmcnt(0)
1519; VI-NEXT:    v_mov_b32_e32 v0, s4
1520; VI-NEXT:    v_mul_lo_u16_e32 v0, s5, v0
1521; VI-NEXT:    v_and_b32_e32 v0, 1, v0
1522; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1523; VI-NEXT:    s_endpgm
1524;
1525; GFX9-LABEL: s_mul_i1:
1526; GFX9:       ; %bb.0: ; %entry
1527; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x70
1528; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x4c
1529; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1530; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1531; GFX9-NEXT:    s_mov_b32 s6, -1
1532; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1533; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1534; GFX9-NEXT:    v_mul_lo_u16_e32 v0, s3, v0
1535; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1536; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1537; GFX9-NEXT:    s_endpgm
1538;
1539; GFX10-LABEL: s_mul_i1:
1540; GFX10:       ; %bb.0: ; %entry
1541; GFX10-NEXT:    s_clause 0x2
1542; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
1543; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
1544; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1545; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1546; GFX10-NEXT:    s_mov_b32 s6, -1
1547; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX10-NEXT:    v_mul_lo_u16 v0, s2, s3
1549; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
1550; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1551; GFX10-NEXT:    s_endpgm
1552;
1553; GFX11-LABEL: s_mul_i1:
1554; GFX11:       ; %bb.0: ; %entry
1555; GFX11-NEXT:    s_clause 0x2
1556; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x4c
1557; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x70
1558; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1559; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX11-NEXT:    v_mul_lo_u16 v0, s2, s3
1561; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1562; GFX11-NEXT:    s_mov_b32 s2, -1
1563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1564; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
1565; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
1566; GFX11-NEXT:    s_nop 0
1567; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1568; GFX11-NEXT:    s_endpgm
1569;
1570; GFX12-LABEL: s_mul_i1:
1571; GFX12:       ; %bb.0: ; %entry
1572; GFX12-NEXT:    s_clause 0x2
1573; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x4c
1574; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x70
1575; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1576; GFX12-NEXT:    s_wait_kmcnt 0x0
1577; GFX12-NEXT:    v_mul_lo_u16 v0, s2, s3
1578; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
1579; GFX12-NEXT:    s_mov_b32 s2, -1
1580; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1581; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
1582; GFX12-NEXT:    buffer_store_b8 v0, off, s[0:3], null
1583; GFX12-NEXT:    s_nop 0
1584; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1585; GFX12-NEXT:    s_endpgm
1586;
1587; EG-LABEL: s_mul_i1:
1588; EG:       ; %bb.0: ; %entry
1589; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
1590; EG-NEXT:    TEX 1 @6
1591; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
1592; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1593; EG-NEXT:    CF_END
1594; EG-NEXT:    PAD
1595; EG-NEXT:    Fetch clause starting at 6:
1596; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 72, #3
1597; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 108, #3
1598; EG-NEXT:    ALU clause starting at 10:
1599; EG-NEXT:     MOV * T0.X, 0.0,
1600; EG-NEXT:    ALU clause starting at 11:
1601; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
1602; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
1603; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1604; EG-NEXT:     AND_INT T1.W, PS, 1,
1605; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1606; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1607; EG-NEXT:     LSHL T0.X, PV.W, PS,
1608; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1609; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1610; EG-NEXT:     MOV T0.Y, 0.0,
1611; EG-NEXT:     MOV * T0.Z, 0.0,
1612; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1613; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: a 1-bit multiply of the two kernel arguments; the i1 store is
; declared with 4-byte alignment.
1614entry:
1615  %mul = mul i1 %a, %b
1616  store i1 %mul, ptr addrspace(1) %out, align 4
1617  ret void
1618}
1619
; i1 multiply whose operands are both loaded through %in: %a from %in itself
; and %b from one i32 element further on (the "offset:4" load in the checks).
; The i1 product is stored to %out.  On the GCN prefixes the checks show the
; product being masked with "v_and_b32 v0, 1, v0" before the byte store.
1620define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1621; SI-LABEL: v_mul_i1:
1622; SI:       ; %bb.0: ; %entry
1623; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1624; SI-NEXT:    s_mov_b32 s7, 0xf000
1625; SI-NEXT:    s_mov_b32 s6, -1
1626; SI-NEXT:    s_mov_b32 s10, s6
1627; SI-NEXT:    s_mov_b32 s11, s7
1628; SI-NEXT:    s_waitcnt lgkmcnt(0)
1629; SI-NEXT:    s_mov_b32 s8, s2
1630; SI-NEXT:    s_mov_b32 s9, s3
1631; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1632; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1633; SI-NEXT:    s_mov_b32 s4, s0
1634; SI-NEXT:    s_mov_b32 s5, s1
1635; SI-NEXT:    s_waitcnt vmcnt(0)
1636; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
1637; SI-NEXT:    v_and_b32_e32 v0, 1, v0
1638; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1639; SI-NEXT:    s_endpgm
1640;
1641; VI-LABEL: v_mul_i1:
1642; VI:       ; %bb.0: ; %entry
1643; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1644; VI-NEXT:    s_mov_b32 s7, 0xf000
1645; VI-NEXT:    s_mov_b32 s6, -1
1646; VI-NEXT:    s_mov_b32 s10, s6
1647; VI-NEXT:    s_mov_b32 s11, s7
1648; VI-NEXT:    s_waitcnt lgkmcnt(0)
1649; VI-NEXT:    s_mov_b32 s8, s2
1650; VI-NEXT:    s_mov_b32 s9, s3
1651; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1652; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1653; VI-NEXT:    s_mov_b32 s4, s0
1654; VI-NEXT:    s_mov_b32 s5, s1
1655; VI-NEXT:    s_waitcnt vmcnt(0)
1656; VI-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
1657; VI-NEXT:    v_and_b32_e32 v0, 1, v0
1658; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1659; VI-NEXT:    s_endpgm
1660;
1661; GFX9-LABEL: v_mul_i1:
1662; GFX9:       ; %bb.0: ; %entry
1663; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1664; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1665; GFX9-NEXT:    s_mov_b32 s6, -1
1666; GFX9-NEXT:    s_mov_b32 s10, s6
1667; GFX9-NEXT:    s_mov_b32 s11, s7
1668; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX9-NEXT:    s_mov_b32 s8, s2
1670; GFX9-NEXT:    s_mov_b32 s9, s3
1671; GFX9-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1672; GFX9-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1673; GFX9-NEXT:    s_mov_b32 s4, s0
1674; GFX9-NEXT:    s_mov_b32 s5, s1
1675; GFX9-NEXT:    s_waitcnt vmcnt(0)
1676; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
1677; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1678; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1679; GFX9-NEXT:    s_endpgm
1680;
1681; GFX10-LABEL: v_mul_i1:
1682; GFX10:       ; %bb.0: ; %entry
1683; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1684; GFX10-NEXT:    s_mov_b32 s6, -1
1685; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1686; GFX10-NEXT:    s_mov_b32 s10, s6
1687; GFX10-NEXT:    s_mov_b32 s11, s7
1688; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1689; GFX10-NEXT:    s_mov_b32 s8, s2
1690; GFX10-NEXT:    s_mov_b32 s9, s3
1691; GFX10-NEXT:    s_clause 0x1
1692; GFX10-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1693; GFX10-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1694; GFX10-NEXT:    s_mov_b32 s4, s0
1695; GFX10-NEXT:    s_mov_b32 s5, s1
1696; GFX10-NEXT:    s_waitcnt vmcnt(0)
1697; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
1698; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
1699; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1700; GFX10-NEXT:    s_endpgm
1701;
1702; GFX11-LABEL: v_mul_i1:
1703; GFX11:       ; %bb.0: ; %entry
1704; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1705; GFX11-NEXT:    s_mov_b32 s6, -1
1706; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1707; GFX11-NEXT:    s_mov_b32 s10, s6
1708; GFX11-NEXT:    s_mov_b32 s11, s7
1709; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX11-NEXT:    s_mov_b32 s8, s2
1711; GFX11-NEXT:    s_mov_b32 s9, s3
1712; GFX11-NEXT:    s_clause 0x1
1713; GFX11-NEXT:    buffer_load_u8 v0, off, s[8:11], 0
1714; GFX11-NEXT:    buffer_load_u8 v1, off, s[8:11], 0 offset:4
1715; GFX11-NEXT:    s_mov_b32 s4, s0
1716; GFX11-NEXT:    s_mov_b32 s5, s1
1717; GFX11-NEXT:    s_waitcnt vmcnt(0)
1718; GFX11-NEXT:    v_mul_lo_u16 v0, v0, v1
1719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1720; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
1721; GFX11-NEXT:    buffer_store_b8 v0, off, s[4:7], 0
1722; GFX11-NEXT:    s_nop 0
1723; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1724; GFX11-NEXT:    s_endpgm
1725;
1726; GFX12-LABEL: v_mul_i1:
1727; GFX12:       ; %bb.0: ; %entry
1728; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1729; GFX12-NEXT:    s_mov_b32 s6, -1
1730; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1731; GFX12-NEXT:    s_mov_b32 s10, s6
1732; GFX12-NEXT:    s_mov_b32 s11, s7
1733; GFX12-NEXT:    s_wait_kmcnt 0x0
1734; GFX12-NEXT:    s_mov_b32 s8, s2
1735; GFX12-NEXT:    s_mov_b32 s9, s3
1736; GFX12-NEXT:    s_clause 0x1
1737; GFX12-NEXT:    buffer_load_u8 v0, off, s[8:11], null
1738; GFX12-NEXT:    buffer_load_u8 v1, off, s[8:11], null offset:4
1739; GFX12-NEXT:    s_mov_b32 s4, s0
1740; GFX12-NEXT:    s_mov_b32 s5, s1
1741; GFX12-NEXT:    s_wait_loadcnt 0x0
1742; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
1743; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1744; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
1745; GFX12-NEXT:    buffer_store_b8 v0, off, s[4:7], null
1746; GFX12-NEXT:    s_nop 0
1747; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1748; GFX12-NEXT:    s_endpgm
1749;
1750; EG-LABEL: v_mul_i1:
1751; EG:       ; %bb.0: ; %entry
1752; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1753; EG-NEXT:    TEX 1 @6
1754; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
1755; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1756; EG-NEXT:    CF_END
1757; EG-NEXT:    PAD
1758; EG-NEXT:    Fetch clause starting at 6:
1759; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 4, #1
1760; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1761; EG-NEXT:    ALU clause starting at 10:
1762; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1763; EG-NEXT:    ALU clause starting at 11:
1764; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
1765; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
1766; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1767; EG-NEXT:     AND_INT T1.W, PS, 1,
1768; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1769; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1770; EG-NEXT:     LSHL T0.X, PV.W, PS,
1771; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1772; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1773; EG-NEXT:     MOV T0.Y, 0.0,
1774; EG-NEXT:     MOV * T0.Z, 0.0,
1775; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1776; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1777entry:
  ; The GEP is typed i32, so %b_ptr is one i32 element past %in even though
  ; only an i1 is loaded from it.
1778  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1779  %a = load i1, ptr addrspace(1) %in
1780  %b = load i1, ptr addrspace(1) %b_ptr
1781  %result = mul i1 %a, %b
1782  store i1 %result, ptr addrspace(1) %out
1783  ret void
1784}
1785
1786; A standard 64-bit multiply.  The expansion should be around 6 instructions.
1787; Earlier revisions of this test deliberately matched only the function label,
1788; because hand-writing FileCheck patterns for the expansion was impractical.
1789; The checks are now autogenerated (see the NOTE at the top of the file), so the
1790; full expansion is matched; if a correct optimization changes the output,
1791; regenerate the checks instead of editing them by hand.
1792
; i64 multiply whose operands %a and %b are kernel arguments; the product is
; stored to %out.  The GFX12 checks contain a single s_mul_u64, while the other
; prefixes check an expansion built from 32-bit multiplies and adds.
1793define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
1794; SI-LABEL: s_mul_i64:
1795; SI:       ; %bb.0: ; %entry
1796; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1797; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1798; SI-NEXT:    s_mov_b32 s3, 0xf000
1799; SI-NEXT:    s_mov_b32 s2, -1
1800; SI-NEXT:    s_waitcnt lgkmcnt(0)
1801; SI-NEXT:    s_mov_b32 s0, s4
1802; SI-NEXT:    v_mov_b32_e32 v0, s8
1803; SI-NEXT:    v_mul_hi_u32 v0, s6, v0
1804; SI-NEXT:    s_mul_i32 s4, s6, s9
1805; SI-NEXT:    s_mov_b32 s1, s5
1806; SI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
1807; SI-NEXT:    s_mul_i32 s4, s7, s8
1808; SI-NEXT:    v_add_i32_e32 v1, vcc, s4, v0
1809; SI-NEXT:    s_mul_i32 s4, s6, s8
1810; SI-NEXT:    v_mov_b32_e32 v0, s4
1811; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1812; SI-NEXT:    s_endpgm
1813;
1814; VI-LABEL: s_mul_i64:
1815; VI:       ; %bb.0: ; %entry
1816; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1817; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1818; VI-NEXT:    s_mov_b32 s3, 0xf000
1819; VI-NEXT:    s_mov_b32 s2, -1
1820; VI-NEXT:    s_waitcnt lgkmcnt(0)
1821; VI-NEXT:    s_mov_b32 s0, s4
1822; VI-NEXT:    v_mov_b32_e32 v0, s8
1823; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0
1824; VI-NEXT:    s_mul_i32 s4, s6, s9
1825; VI-NEXT:    s_mov_b32 s1, s5
1826; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1827; VI-NEXT:    s_mul_i32 s4, s7, s8
1828; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1829; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1830; VI-NEXT:    s_endpgm
1831;
1832; GFX9-LABEL: s_mul_i64:
1833; GFX9:       ; %bb.0: ; %entry
1834; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1835; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1836; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1837; GFX9-NEXT:    s_mov_b32 s2, -1
1838; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1839; GFX9-NEXT:    s_mov_b32 s0, s4
1840; GFX9-NEXT:    s_mov_b32 s1, s5
1841; GFX9-NEXT:    s_mul_i32 s4, s6, s9
1842; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s8
1843; GFX9-NEXT:    s_add_i32 s4, s5, s4
1844; GFX9-NEXT:    s_mul_i32 s5, s7, s8
1845; GFX9-NEXT:    s_add_i32 s4, s4, s5
1846; GFX9-NEXT:    s_mul_i32 s5, s6, s8
1847; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1848; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1849; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1850; GFX9-NEXT:    s_endpgm
1851;
1852; GFX10-LABEL: s_mul_i64:
1853; GFX10:       ; %bb.0: ; %entry
1854; GFX10-NEXT:    s_clause 0x1
1855; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1856; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1857; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1858; GFX10-NEXT:    s_mul_i32 s0, s6, s3
1859; GFX10-NEXT:    s_mul_hi_u32 s1, s6, s2
1860; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1861; GFX10-NEXT:    s_add_i32 s0, s1, s0
1862; GFX10-NEXT:    s_mul_i32 s1, s7, s2
1863; GFX10-NEXT:    s_mul_i32 s2, s6, s2
1864; GFX10-NEXT:    s_add_i32 s0, s0, s1
1865; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1866; GFX10-NEXT:    v_mov_b32_e32 v1, s0
1867; GFX10-NEXT:    s_mov_b32 s2, -1
1868; GFX10-NEXT:    s_mov_b32 s0, s4
1869; GFX10-NEXT:    s_mov_b32 s1, s5
1870; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1871; GFX10-NEXT:    s_endpgm
1872;
1873; GFX11-LABEL: s_mul_i64:
1874; GFX11:       ; %bb.0: ; %entry
1875; GFX11-NEXT:    s_clause 0x1
1876; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1877; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1878; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1879; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1880; GFX11-NEXT:    s_mul_i32 s1, s6, s1
1881; GFX11-NEXT:    s_mul_hi_u32 s2, s6, s0
1882; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1883; GFX11-NEXT:    s_add_i32 s1, s2, s1
1884; GFX11-NEXT:    s_mul_i32 s2, s7, s0
1885; GFX11-NEXT:    s_mul_i32 s0, s6, s0
1886; GFX11-NEXT:    s_add_i32 s1, s1, s2
1887; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1888; GFX11-NEXT:    s_mov_b32 s2, -1
1889; GFX11-NEXT:    s_mov_b32 s0, s4
1890; GFX11-NEXT:    s_mov_b32 s1, s5
1891; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1892; GFX11-NEXT:    s_nop 0
1893; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1894; GFX11-NEXT:    s_endpgm
1895;
1896; GFX12-LABEL: s_mul_i64:
1897; GFX12:       ; %bb.0: ; %entry
1898; GFX12-NEXT:    s_clause 0x1
1899; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1900; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1901; GFX12-NEXT:    s_wait_kmcnt 0x0
1902; GFX12-NEXT:    s_mul_u64 s[0:1], s[6:7], s[0:1]
1903; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
1904; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1905; GFX12-NEXT:    s_mov_b32 s6, -1
1906; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], null
1907; GFX12-NEXT:    s_nop 0
1908; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1909; GFX12-NEXT:    s_endpgm
1910;
1911; EG-LABEL: s_mul_i64:
1912; EG:       ; %bb.0: ; %entry
1913; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1914; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1915; EG-NEXT:    CF_END
1916; EG-NEXT:    PAD
1917; EG-NEXT:    ALU clause starting at 4:
1918; EG-NEXT:     MULHI * T0.X, KC0[2].W, KC0[3].Y,
1919; EG-NEXT:     MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z,
1920; EG-NEXT:     ADD_INT T0.W, T0.X, PS,
1921; EG-NEXT:     MULLO_INT * T0.X, KC0[3].X, KC0[3].Y,
1922; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
1923; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1924; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1925; EG-NEXT:     MULLO_INT * T0.X, KC0[2].W, KC0[3].Y,
1926entry:
1927  %mul = mul i64 %a, %b
1928  store i64 %mul, ptr addrspace(1) %out, align 8
1929  ret void
1930}
1931
; i64 multiply whose operands are loaded from memory through %aptr and %bptr;
; the product is stored to %out.  None of the checked prefixes uses a single
; 64-bit multiply here: the checks show v_mul_lo_u32 / v_mul_hi_u32 (or
; v_mad_u64_u32 on VI) followed by adds, and MULHI / MULLO_INT on EG.
1932define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
1933; SI-LABEL: v_mul_i64:
1934; SI:       ; %bb.0: ; %entry
1935; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1936; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1937; SI-NEXT:    s_mov_b32 s3, 0xf000
1938; SI-NEXT:    s_mov_b32 s2, -1
1939; SI-NEXT:    s_mov_b32 s10, s2
1940; SI-NEXT:    s_mov_b32 s11, s3
1941; SI-NEXT:    s_waitcnt lgkmcnt(0)
1942; SI-NEXT:    s_mov_b32 s12, s6
1943; SI-NEXT:    s_mov_b32 s13, s7
1944; SI-NEXT:    s_mov_b32 s14, s2
1945; SI-NEXT:    s_mov_b32 s15, s3
1946; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1947; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1948; SI-NEXT:    s_mov_b32 s0, s4
1949; SI-NEXT:    s_mov_b32 s1, s5
1950; SI-NEXT:    s_waitcnt vmcnt(0)
1951; SI-NEXT:    v_mul_lo_u32 v1, v2, v1
1952; SI-NEXT:    v_mul_hi_u32 v4, v2, v0
1953; SI-NEXT:    v_mul_lo_u32 v3, v3, v0
1954; SI-NEXT:    v_mul_lo_u32 v0, v2, v0
1955; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1956; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1957; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1958; SI-NEXT:    s_endpgm
1959;
1960; VI-LABEL: v_mul_i64:
1961; VI:       ; %bb.0: ; %entry
1962; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1963; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1964; VI-NEXT:    s_mov_b32 s3, 0xf000
1965; VI-NEXT:    s_mov_b32 s2, -1
1966; VI-NEXT:    s_mov_b32 s10, s2
1967; VI-NEXT:    s_mov_b32 s11, s3
1968; VI-NEXT:    s_waitcnt lgkmcnt(0)
1969; VI-NEXT:    s_mov_b32 s12, s6
1970; VI-NEXT:    s_mov_b32 s13, s7
1971; VI-NEXT:    s_mov_b32 s14, s2
1972; VI-NEXT:    s_mov_b32 s15, s3
1973; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1974; VI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1975; VI-NEXT:    s_mov_b32 s0, s4
1976; VI-NEXT:    s_mov_b32 s1, s5
1977; VI-NEXT:    s_waitcnt vmcnt(0)
1978; VI-NEXT:    v_mul_lo_u32 v4, v2, v1
1979; VI-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0
1980; VI-NEXT:    v_mul_lo_u32 v0, v3, v0
1981; VI-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
1982; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
1983; VI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
1984; VI-NEXT:    s_endpgm
1985;
1986; GFX9-LABEL: v_mul_i64:
1987; GFX9:       ; %bb.0: ; %entry
1988; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1989; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1990; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1991; GFX9-NEXT:    s_mov_b32 s2, -1
1992; GFX9-NEXT:    s_mov_b32 s10, s2
1993; GFX9-NEXT:    s_mov_b32 s11, s3
1994; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1995; GFX9-NEXT:    s_mov_b32 s12, s6
1996; GFX9-NEXT:    s_mov_b32 s13, s7
1997; GFX9-NEXT:    s_mov_b32 s14, s2
1998; GFX9-NEXT:    s_mov_b32 s15, s3
1999; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2000; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
2001; GFX9-NEXT:    s_mov_b32 s0, s4
2002; GFX9-NEXT:    s_mov_b32 s1, s5
2003; GFX9-NEXT:    s_waitcnt vmcnt(0)
2004; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
2005; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v0
2006; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
2007; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
2008; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
2009; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
2010; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2011; GFX9-NEXT:    s_endpgm
2012;
2013; GFX10-LABEL: v_mul_i64:
2014; GFX10:       ; %bb.0: ; %entry
2015; GFX10-NEXT:    s_clause 0x1
2016; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2017; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
2018; GFX10-NEXT:    s_mov_b32 s2, -1
2019; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2020; GFX10-NEXT:    s_mov_b32 s10, s2
2021; GFX10-NEXT:    s_mov_b32 s11, s3
2022; GFX10-NEXT:    s_mov_b32 s14, s2
2023; GFX10-NEXT:    s_mov_b32 s15, s3
2024; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2025; GFX10-NEXT:    s_mov_b32 s12, s6
2026; GFX10-NEXT:    s_mov_b32 s13, s7
2027; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2028; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
2029; GFX10-NEXT:    s_mov_b32 s0, s4
2030; GFX10-NEXT:    s_mov_b32 s1, s5
2031; GFX10-NEXT:    s_waitcnt vmcnt(0)
2032; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
2033; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v0
2034; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
2035; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
2036; GFX10-NEXT:    v_add_nc_u32_e32 v1, v4, v1
2037; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2038; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2039; GFX10-NEXT:    s_endpgm
2040;
2041; GFX11-LABEL: v_mul_i64:
2042; GFX11:       ; %bb.0: ; %entry
2043; GFX11-NEXT:    s_clause 0x1
2044; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2045; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
2046; GFX11-NEXT:    s_mov_b32 s10, -1
2047; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
2048; GFX11-NEXT:    s_mov_b32 s2, s10
2049; GFX11-NEXT:    s_mov_b32 s3, s11
2050; GFX11-NEXT:    s_mov_b32 s14, s10
2051; GFX11-NEXT:    s_mov_b32 s15, s11
2052; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX11-NEXT:    s_mov_b32 s12, s6
2054; GFX11-NEXT:    s_mov_b32 s13, s7
2055; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
2056; GFX11-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], 0
2057; GFX11-NEXT:    s_mov_b32 s8, s4
2058; GFX11-NEXT:    s_mov_b32 s9, s5
2059; GFX11-NEXT:    s_waitcnt vmcnt(0)
2060; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v1
2061; GFX11-NEXT:    v_mul_hi_u32 v4, v2, v0
2062; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v0
2063; GFX11-NEXT:    v_mul_lo_u32 v0, v2, v0
2064; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2065; GFX11-NEXT:    v_add_nc_u32_e32 v1, v4, v1
2066; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2067; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
2068; GFX11-NEXT:    s_nop 0
2069; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2070; GFX11-NEXT:    s_endpgm
2071;
2072; GFX12-LABEL: v_mul_i64:
2073; GFX12:       ; %bb.0: ; %entry
2074; GFX12-NEXT:    s_clause 0x1
2075; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2076; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
2077; GFX12-NEXT:    s_mov_b32 s10, -1
2078; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
2079; GFX12-NEXT:    s_mov_b32 s2, s10
2080; GFX12-NEXT:    s_mov_b32 s3, s11
2081; GFX12-NEXT:    s_mov_b32 s14, s10
2082; GFX12-NEXT:    s_mov_b32 s15, s11
2083; GFX12-NEXT:    s_wait_kmcnt 0x0
2084; GFX12-NEXT:    s_mov_b32 s12, s6
2085; GFX12-NEXT:    s_mov_b32 s13, s7
2086; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], null
2087; GFX12-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], null
2088; GFX12-NEXT:    s_mov_b32 s8, s4
2089; GFX12-NEXT:    s_mov_b32 s9, s5
2090; GFX12-NEXT:    s_wait_loadcnt 0x0
2091; GFX12-NEXT:    v_mul_lo_u32 v3, v0, v3
2092; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v2
2093; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
2094; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
2095; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2096; GFX12-NEXT:    v_add_nc_u32_e32 v1, v3, v1
2097; GFX12-NEXT:    v_add_nc_u32_e32 v1, v1, v4
2098; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
2099; GFX12-NEXT:    s_nop 0
2100; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2101; GFX12-NEXT:    s_endpgm
2102;
2103; EG-LABEL: v_mul_i64:
2104; EG:       ; %bb.0: ; %entry
2105; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2106; EG-NEXT:    TEX 1 @6
2107; EG-NEXT:    ALU 7, @12, KC0[CB0:0-32], KC1[]
2108; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
2109; EG-NEXT:    CF_END
2110; EG-NEXT:    PAD
2111; EG-NEXT:    Fetch clause starting at 6:
2112; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
2113; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
2114; EG-NEXT:    ALU clause starting at 10:
2115; EG-NEXT:     MOV T0.X, KC0[2].Z,
2116; EG-NEXT:     MOV * T1.X, KC0[2].W,
2117; EG-NEXT:    ALU clause starting at 12:
2118; EG-NEXT:     MULHI * T0.Z, T0.X, T1.X,
2119; EG-NEXT:     MULLO_INT * T0.W, T0.X, T1.Y,
2120; EG-NEXT:     ADD_INT T0.W, T0.Z, PS,
2121; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.X,
2122; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
2123; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2124; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
2125; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2126entry:
2127  %a = load i64, ptr addrspace(1) %aptr, align 8
2128  %b = load i64, ptr addrspace(1) %bptr, align 8
2129  %mul = mul i64 %a, %b
2130  store i64 %mul, ptr addrspace(1) %out, align 8
2131  ret void
2132}
2133
; i32 multiply placed on one arm of a branch.  When %a == 0 the value stored to
; %out is loaded from %in (%if); otherwise it is %a * %b (%else).  The two arms
; are joined by a phi in %endif.  The %c argument is not referenced by the
; body.  On the GCN prefixes the multiply in %else is checked as s_mul_i32.
2134define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
2135; SI-LABEL: mul32_in_branch:
2136; SI:       ; %bb.0: ; %entry
2137; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
2138; SI-NEXT:    s_waitcnt lgkmcnt(0)
2139; SI-NEXT:    s_cmp_lg_u32 s2, 0
2140; SI-NEXT:    s_cbranch_scc0 .LBB15_2
2141; SI-NEXT:  ; %bb.1: ; %else
2142; SI-NEXT:    s_mul_i32 s6, s2, s3
2143; SI-NEXT:    s_mov_b64 s[4:5], 0
2144; SI-NEXT:    s_branch .LBB15_3
2145; SI-NEXT:  .LBB15_2:
2146; SI-NEXT:    s_mov_b64 s[4:5], -1
2147; SI-NEXT:    ; implicit-def: $sgpr6
2148; SI-NEXT:  .LBB15_3: ; %Flow
2149; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2150; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
2151; SI-NEXT:    s_waitcnt lgkmcnt(0)
2152; SI-NEXT:    s_mov_b64 vcc, vcc
2153; SI-NEXT:    s_cbranch_vccnz .LBB15_5
2154; SI-NEXT:  ; %bb.4: ; %if
2155; SI-NEXT:    s_mov_b32 s7, 0xf000
2156; SI-NEXT:    s_mov_b32 s6, -1
2157; SI-NEXT:    s_mov_b32 s4, s2
2158; SI-NEXT:    s_mov_b32 s5, s3
2159; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2160; SI-NEXT:    s_branch .LBB15_6
2161; SI-NEXT:  .LBB15_5:
2162; SI-NEXT:    v_mov_b32_e32 v0, s6
2163; SI-NEXT:  .LBB15_6: ; %endif
2164; SI-NEXT:    s_mov_b32 s3, 0xf000
2165; SI-NEXT:    s_mov_b32 s2, -1
2166; SI-NEXT:    s_waitcnt vmcnt(0)
2167; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2168; SI-NEXT:    s_endpgm
2169;
2170; VI-LABEL: mul32_in_branch:
2171; VI:       ; %bb.0: ; %entry
2172; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2173; VI-NEXT:    s_waitcnt lgkmcnt(0)
2174; VI-NEXT:    s_cmp_lg_u32 s2, 0
2175; VI-NEXT:    s_cbranch_scc0 .LBB15_2
2176; VI-NEXT:  ; %bb.1: ; %else
2177; VI-NEXT:    s_mul_i32 s6, s2, s3
2178; VI-NEXT:    s_mov_b64 s[4:5], 0
2179; VI-NEXT:    s_branch .LBB15_3
2180; VI-NEXT:  .LBB15_2:
2181; VI-NEXT:    s_mov_b64 s[4:5], -1
2182; VI-NEXT:    ; implicit-def: $sgpr6
2183; VI-NEXT:  .LBB15_3: ; %Flow
2184; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2185; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
2186; VI-NEXT:    s_cbranch_vccnz .LBB15_5
2187; VI-NEXT:  ; %bb.4: ; %if
2188; VI-NEXT:    s_mov_b32 s7, 0xf000
2189; VI-NEXT:    s_mov_b32 s6, -1
2190; VI-NEXT:    s_waitcnt lgkmcnt(0)
2191; VI-NEXT:    s_mov_b32 s4, s2
2192; VI-NEXT:    s_mov_b32 s5, s3
2193; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2194; VI-NEXT:    s_branch .LBB15_6
2195; VI-NEXT:  .LBB15_5:
2196; VI-NEXT:    v_mov_b32_e32 v0, s6
2197; VI-NEXT:  .LBB15_6: ; %endif
2198; VI-NEXT:    s_waitcnt lgkmcnt(0)
2199; VI-NEXT:    s_mov_b32 s3, 0xf000
2200; VI-NEXT:    s_mov_b32 s2, -1
2201; VI-NEXT:    s_waitcnt vmcnt(0)
2202; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2203; VI-NEXT:    s_endpgm
2204;
2205; GFX9-LABEL: mul32_in_branch:
2206; GFX9:       ; %bb.0: ; %entry
2207; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2208; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2209; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
2210; GFX9-NEXT:    s_cbranch_scc0 .LBB15_2
2211; GFX9-NEXT:  ; %bb.1: ; %else
2212; GFX9-NEXT:    s_mul_i32 s6, s2, s3
2213; GFX9-NEXT:    s_mov_b64 s[4:5], 0
2214; GFX9-NEXT:    s_branch .LBB15_3
2215; GFX9-NEXT:  .LBB15_2:
2216; GFX9-NEXT:    s_mov_b64 s[4:5], -1
2217; GFX9-NEXT:    ; implicit-def: $sgpr6
2218; GFX9-NEXT:  .LBB15_3: ; %Flow
2219; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2220; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
2221; GFX9-NEXT:    s_cbranch_vccnz .LBB15_5
2222; GFX9-NEXT:  ; %bb.4: ; %if
2223; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2224; GFX9-NEXT:    s_mov_b32 s6, -1
2225; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2226; GFX9-NEXT:    s_mov_b32 s4, s2
2227; GFX9-NEXT:    s_mov_b32 s5, s3
2228; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2229; GFX9-NEXT:    s_branch .LBB15_6
2230; GFX9-NEXT:  .LBB15_5:
2231; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2232; GFX9-NEXT:  .LBB15_6: ; %endif
2233; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2234; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2235; GFX9-NEXT:    s_mov_b32 s2, -1
2236; GFX9-NEXT:    s_waitcnt vmcnt(0)
2237; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2238; GFX9-NEXT:    s_endpgm
2239;
2240; GFX10-LABEL: mul32_in_branch:
2241; GFX10:       ; %bb.0: ; %entry
2242; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2243; GFX10-NEXT:    s_mov_b32 s4, 0
2244; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2245; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
2246; GFX10-NEXT:    s_cbranch_scc0 .LBB15_2
2247; GFX10-NEXT:  ; %bb.1: ; %else
2248; GFX10-NEXT:    s_mul_i32 s5, s2, s3
2249; GFX10-NEXT:    s_branch .LBB15_3
2250; GFX10-NEXT:  .LBB15_2:
2251; GFX10-NEXT:    s_mov_b32 s4, -1
2252; GFX10-NEXT:    ; implicit-def: $sgpr5
2253; GFX10-NEXT:  .LBB15_3: ; %Flow
2254; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2255; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
2256; GFX10-NEXT:    s_cbranch_vccnz .LBB15_5
2257; GFX10-NEXT:  ; %bb.4: ; %if
2258; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
2259; GFX10-NEXT:    s_mov_b32 s6, -1
2260; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2261; GFX10-NEXT:    s_mov_b32 s4, s2
2262; GFX10-NEXT:    s_mov_b32 s5, s3
2263; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
2264; GFX10-NEXT:    s_branch .LBB15_6
2265; GFX10-NEXT:  .LBB15_5:
2266; GFX10-NEXT:    v_mov_b32_e32 v0, s5
2267; GFX10-NEXT:  .LBB15_6: ; %endif
2268; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2269; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2270; GFX10-NEXT:    s_mov_b32 s2, -1
2271; GFX10-NEXT:    s_waitcnt vmcnt(0)
2272; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2273; GFX10-NEXT:    s_endpgm
2274;
2275; GFX11-LABEL: mul32_in_branch:
2276; GFX11:       ; %bb.0: ; %entry
2277; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
2278; GFX11-NEXT:    s_mov_b32 s4, 0
2279; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2280; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
2281; GFX11-NEXT:    s_cbranch_scc0 .LBB15_2
2282; GFX11-NEXT:  ; %bb.1: ; %else
2283; GFX11-NEXT:    s_mul_i32 s5, s2, s3
2284; GFX11-NEXT:    s_branch .LBB15_3
2285; GFX11-NEXT:  .LBB15_2:
2286; GFX11-NEXT:    s_mov_b32 s4, -1
2287; GFX11-NEXT:    ; implicit-def: $sgpr5
2288; GFX11-NEXT:  .LBB15_3: ; %Flow
2289; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2290; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
2291; GFX11-NEXT:    s_cbranch_vccnz .LBB15_5
2292; GFX11-NEXT:  ; %bb.4: ; %if
2293; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
2294; GFX11-NEXT:    s_mov_b32 s6, -1
2295; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2296; GFX11-NEXT:    s_mov_b32 s4, s2
2297; GFX11-NEXT:    s_mov_b32 s5, s3
2298; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
2299; GFX11-NEXT:    s_branch .LBB15_6
2300; GFX11-NEXT:  .LBB15_5:
2301; GFX11-NEXT:    v_mov_b32_e32 v0, s5
2302; GFX11-NEXT:  .LBB15_6: ; %endif
2303; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2304; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2305; GFX11-NEXT:    s_mov_b32 s2, -1
2306; GFX11-NEXT:    s_waitcnt vmcnt(0)
2307; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2308; GFX11-NEXT:    s_nop 0
2309; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2310; GFX11-NEXT:    s_endpgm
2311;
2312; GFX12-LABEL: mul32_in_branch:
2313; GFX12:       ; %bb.0: ; %entry
2314; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
2315; GFX12-NEXT:    s_mov_b32 s4, 0
2316; GFX12-NEXT:    s_wait_kmcnt 0x0
2317; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
2318; GFX12-NEXT:    s_cbranch_scc0 .LBB15_2
2319; GFX12-NEXT:  ; %bb.1: ; %else
2320; GFX12-NEXT:    s_mul_i32 s5, s2, s3
2321; GFX12-NEXT:    s_branch .LBB15_3
2322; GFX12-NEXT:  .LBB15_2:
2323; GFX12-NEXT:    s_mov_b32 s4, -1
2324; GFX12-NEXT:    ; implicit-def: $sgpr5
2325; GFX12-NEXT:  .LBB15_3: ; %Flow
2326; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2327; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
2328; GFX12-NEXT:    s_cbranch_vccnz .LBB15_5
2329; GFX12-NEXT:  ; %bb.4: ; %if
2330; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
2331; GFX12-NEXT:    s_mov_b32 s6, -1
2332; GFX12-NEXT:    s_wait_kmcnt 0x0
2333; GFX12-NEXT:    s_mov_b32 s4, s2
2334; GFX12-NEXT:    s_mov_b32 s5, s3
2335; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
2336; GFX12-NEXT:    s_branch .LBB15_6
2337; GFX12-NEXT:  .LBB15_5:
2338; GFX12-NEXT:    v_mov_b32_e32 v0, s5
2339; GFX12-NEXT:  .LBB15_6: ; %endif
2340; GFX12-NEXT:    s_wait_kmcnt 0x0
2341; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
2342; GFX12-NEXT:    s_mov_b32 s2, -1
2343; GFX12-NEXT:    s_wait_loadcnt 0x0
2344; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null
2345; GFX12-NEXT:    s_nop 0
2346; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2347; GFX12-NEXT:    s_endpgm
2348;
2349; EG-LABEL: mul32_in_branch:
2350; EG:       ; %bb.0: ; %entry
2351; EG-NEXT:    ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
2352; EG-NEXT:    JUMP @3 POP:1
2353; EG-NEXT:    ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[]
2354; EG-NEXT:    ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[]
2355; EG-NEXT:    JUMP @8 POP:1
2356; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
2357; EG-NEXT:    TEX 0 @12
2358; EG-NEXT:    POP @8 POP:1
2359; EG-NEXT:    ALU 1, @27, KC0[], KC1[]
2360; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2361; EG-NEXT:    CF_END
2362; EG-NEXT:    PAD
2363; EG-NEXT:    Fetch clause starting at 12:
2364; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
2365; EG-NEXT:    ALU clause starting at 14:
2366; EG-NEXT:     MOV T0.W, literal.x,
2367; EG-NEXT:     SETNE_INT * T1.W, KC0[2].W, 0.0,
2368; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
2369; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2370; EG-NEXT:    ALU clause starting at 18:
2371; EG-NEXT:     MOV T1.W, KC0[2].W,
2372; EG-NEXT:     MOV * T2.W, KC0[3].X,
2373; EG-NEXT:     MOV T0.W, literal.x,
2374; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
2375; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2376; EG-NEXT:    ALU clause starting at 23:
2377; EG-NEXT:     MOV T1.W, KC0[2].Y,
2378; EG-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
2379; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2380; EG-NEXT:    ALU clause starting at 26:
2381; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2382; EG-NEXT:    ALU clause starting at 27:
2383; EG-NEXT:     LSHR * T1.X, T1.W, literal.x,
2384; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2385entry:
2386  %0 = icmp eq i32 %a, 0
2387  br i1 %0, label %if, label %else
2388
2389if:
2390  %1 = load i32, ptr addrspace(1) %in
2391  br label %endif
2392
2393else:
2394  %2 = mul i32 %a, %b
2395  br label %endif
2396
2397endif:
2398  %3 = phi i32 [%1, %if], [%2, %else]
2399  store i32 %3, ptr addrspace(1) %out
2400  ret void
2401}
2402
; Test kernel for a 64-bit multiply that is needed on only one side of a branch.
; When %a is zero the kernel stores an i64 loaded from %in, otherwise it stores
; the product of %a and %b. Both candidates reach the store through the phi in
; %endif. The %c argument is not referenced by the IR in this function.
; The per-target assertions below are generated (see the note on the first line
; of this file), so regenerate them with the update script instead of editing
; them by hand.
2403define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
2404; SI-LABEL: mul64_in_branch:
2405; SI:       ; %bb.0: ; %entry
2406; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
2407; SI-NEXT:    s_mov_b64 s[8:9], 0
2408; SI-NEXT:    s_waitcnt lgkmcnt(0)
2409; SI-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
2410; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
2411; SI-NEXT:    s_cbranch_vccz .LBB16_4
2412; SI-NEXT:  ; %bb.1: ; %else
2413; SI-NEXT:    v_mov_b32_e32 v0, s6
2414; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
2415; SI-NEXT:    s_mul_i32 s7, s4, s7
2416; SI-NEXT:    s_mul_i32 s5, s5, s6
2417; SI-NEXT:    s_mul_i32 s4, s4, s6
2418; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
2419; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
2420; SI-NEXT:    v_mov_b32_e32 v0, s4
2421; SI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
2422; SI-NEXT:    s_cbranch_vccnz .LBB16_3
2423; SI-NEXT:  .LBB16_2: ; %if
2424; SI-NEXT:    s_mov_b32 s7, 0xf000
2425; SI-NEXT:    s_mov_b32 s6, -1
2426; SI-NEXT:    s_mov_b32 s4, s2
2427; SI-NEXT:    s_mov_b32 s5, s3
2428; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2429; SI-NEXT:  .LBB16_3: ; %endif
2430; SI-NEXT:    s_mov_b32 s3, 0xf000
2431; SI-NEXT:    s_mov_b32 s2, -1
2432; SI-NEXT:    s_waitcnt vmcnt(0)
2433; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2434; SI-NEXT:    s_endpgm
2435; SI-NEXT:  .LBB16_4:
2436; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
2437; SI-NEXT:    s_branch .LBB16_2
2438;
2439; VI-LABEL: mul64_in_branch:
2440; VI:       ; %bb.0: ; %entry
2441; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
2442; VI-NEXT:    s_mov_b64 s[8:9], 0
2443; VI-NEXT:    s_waitcnt lgkmcnt(0)
2444; VI-NEXT:    s_cmp_lg_u64 s[4:5], 0
2445; VI-NEXT:    s_cbranch_scc0 .LBB16_4
2446; VI-NEXT:  ; %bb.1: ; %else
2447; VI-NEXT:    v_mov_b32_e32 v0, s6
2448; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
2449; VI-NEXT:    s_mul_i32 s4, s4, s7
2450; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
2451; VI-NEXT:    s_mul_i32 s4, s5, s6
2452; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
2453; VI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
2454; VI-NEXT:    s_cbranch_vccnz .LBB16_3
2455; VI-NEXT:  .LBB16_2: ; %if
2456; VI-NEXT:    s_mov_b32 s7, 0xf000
2457; VI-NEXT:    s_mov_b32 s6, -1
2458; VI-NEXT:    s_mov_b32 s4, s2
2459; VI-NEXT:    s_mov_b32 s5, s3
2460; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2461; VI-NEXT:  .LBB16_3: ; %endif
2462; VI-NEXT:    s_mov_b32 s3, 0xf000
2463; VI-NEXT:    s_mov_b32 s2, -1
2464; VI-NEXT:    s_waitcnt vmcnt(0)
2465; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2466; VI-NEXT:    s_endpgm
2467; VI-NEXT:  .LBB16_4:
2468; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
2469; VI-NEXT:    s_branch .LBB16_2
2470;
2471; GFX9-LABEL: mul64_in_branch:
2472; GFX9:       ; %bb.0: ; %entry
2473; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
2474; GFX9-NEXT:    s_mov_b64 s[8:9], 0
2475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2476; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
2477; GFX9-NEXT:    s_cbranch_scc0 .LBB16_3
2478; GFX9-NEXT:  ; %bb.1: ; %else
2479; GFX9-NEXT:    s_mul_i32 s7, s4, s7
2480; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s6
2481; GFX9-NEXT:    s_add_i32 s7, s10, s7
2482; GFX9-NEXT:    s_mul_i32 s5, s5, s6
2483; GFX9-NEXT:    s_add_i32 s5, s7, s5
2484; GFX9-NEXT:    s_mul_i32 s4, s4, s6
2485; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
2486; GFX9-NEXT:    s_cbranch_vccnz .LBB16_4
2487; GFX9-NEXT:  .LBB16_2: ; %if
2488; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2489; GFX9-NEXT:    s_mov_b32 s6, -1
2490; GFX9-NEXT:    s_mov_b32 s4, s2
2491; GFX9-NEXT:    s_mov_b32 s5, s3
2492; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2493; GFX9-NEXT:    s_branch .LBB16_5
2494; GFX9-NEXT:  .LBB16_3:
2495; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
2496; GFX9-NEXT:    s_branch .LBB16_2
2497; GFX9-NEXT:  .LBB16_4:
2498; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2499; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2500; GFX9-NEXT:  .LBB16_5: ; %endif
2501; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2502; GFX9-NEXT:    s_mov_b32 s2, -1
2503; GFX9-NEXT:    s_waitcnt vmcnt(0)
2504; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2505; GFX9-NEXT:    s_endpgm
2506;
2507; GFX10-LABEL: mul64_in_branch:
2508; GFX10:       ; %bb.0: ; %entry
2509; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
2510; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
2512; GFX10-NEXT:    s_cbranch_scc0 .LBB16_3
2513; GFX10-NEXT:  ; %bb.1: ; %else
2514; GFX10-NEXT:    s_mul_i32 s7, s4, s7
2515; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
2516; GFX10-NEXT:    s_mul_i32 s5, s5, s6
2517; GFX10-NEXT:    s_add_i32 s7, s8, s7
2518; GFX10-NEXT:    s_mul_i32 s4, s4, s6
2519; GFX10-NEXT:    s_add_i32 s5, s7, s5
2520; GFX10-NEXT:    s_mov_b32 s6, 0
2521; GFX10-NEXT:    s_cbranch_execnz .LBB16_4
2522; GFX10-NEXT:  .LBB16_2: ; %if
2523; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
2524; GFX10-NEXT:    s_mov_b32 s6, -1
2525; GFX10-NEXT:    s_mov_b32 s4, s2
2526; GFX10-NEXT:    s_mov_b32 s5, s3
2527; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2528; GFX10-NEXT:    s_branch .LBB16_5
2529; GFX10-NEXT:  .LBB16_3:
2530; GFX10-NEXT:    s_mov_b32 s6, -1
2531; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
2532; GFX10-NEXT:    s_branch .LBB16_2
2533; GFX10-NEXT:  .LBB16_4:
2534; GFX10-NEXT:    v_mov_b32_e32 v0, s4
2535; GFX10-NEXT:    v_mov_b32_e32 v1, s5
2536; GFX10-NEXT:  .LBB16_5: ; %endif
2537; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2538; GFX10-NEXT:    s_mov_b32 s2, -1
2539; GFX10-NEXT:    s_waitcnt vmcnt(0)
2540; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2541; GFX10-NEXT:    s_endpgm
2542;
2543; GFX11-LABEL: mul64_in_branch:
2544; GFX11:       ; %bb.0: ; %entry
2545; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
2546; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2547; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
2548; GFX11-NEXT:    s_cbranch_scc0 .LBB16_3
2549; GFX11-NEXT:  ; %bb.1: ; %else
2550; GFX11-NEXT:    s_mul_i32 s7, s4, s7
2551; GFX11-NEXT:    s_mul_hi_u32 s8, s4, s6
2552; GFX11-NEXT:    s_mul_i32 s5, s5, s6
2553; GFX11-NEXT:    s_add_i32 s7, s8, s7
2554; GFX11-NEXT:    s_mul_i32 s4, s4, s6
2555; GFX11-NEXT:    s_add_i32 s5, s7, s5
2556; GFX11-NEXT:    s_mov_b32 s6, 0
2557; GFX11-NEXT:    s_cbranch_execnz .LBB16_4
2558; GFX11-NEXT:  .LBB16_2: ; %if
2559; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
2560; GFX11-NEXT:    s_mov_b32 s6, -1
2561; GFX11-NEXT:    s_mov_b32 s4, s2
2562; GFX11-NEXT:    s_mov_b32 s5, s3
2563; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
2564; GFX11-NEXT:    s_branch .LBB16_5
2565; GFX11-NEXT:  .LBB16_3:
2566; GFX11-NEXT:    s_mov_b32 s6, -1
2567; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
2568; GFX11-NEXT:    s_branch .LBB16_2
2569; GFX11-NEXT:  .LBB16_4:
2570; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2571; GFX11-NEXT:  .LBB16_5: ; %endif
2572; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2573; GFX11-NEXT:    s_mov_b32 s2, -1
2574; GFX11-NEXT:    s_waitcnt vmcnt(0)
2575; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2576; GFX11-NEXT:    s_nop 0
2577; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2578; GFX11-NEXT:    s_endpgm
2579;
2580; GFX12-LABEL: mul64_in_branch:
2581; GFX12:       ; %bb.0: ; %entry
2582; GFX12-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
2583; GFX12-NEXT:    s_wait_kmcnt 0x0
2584; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
2585; GFX12-NEXT:    s_cbranch_scc0 .LBB16_3
2586; GFX12-NEXT:  ; %bb.1: ; %else
2587; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
2588; GFX12-NEXT:    s_mov_b32 s6, 0
2589; GFX12-NEXT:    s_cbranch_execnz .LBB16_4
2590; GFX12-NEXT:  .LBB16_2: ; %if
2591; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
2592; GFX12-NEXT:    s_mov_b32 s6, -1
2593; GFX12-NEXT:    s_mov_b32 s4, s2
2594; GFX12-NEXT:    s_mov_b32 s5, s3
2595; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], null
2596; GFX12-NEXT:    s_branch .LBB16_5
2597; GFX12-NEXT:  .LBB16_3:
2598; GFX12-NEXT:    s_mov_b32 s6, -1
2599; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
2600; GFX12-NEXT:    s_branch .LBB16_2
2601; GFX12-NEXT:  .LBB16_4:
2602; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2603; GFX12-NEXT:  .LBB16_5: ; %endif
2604; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
2605; GFX12-NEXT:    s_mov_b32 s2, -1
2606; GFX12-NEXT:    s_wait_loadcnt 0x0
2607; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], null
2608; GFX12-NEXT:    s_nop 0
2609; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2610; GFX12-NEXT:    s_endpgm
2611;
2612; EG-LABEL: mul64_in_branch:
2613; EG:       ; %bb.0: ; %entry
2614; EG-NEXT:    ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
2615; EG-NEXT:    JUMP @3 POP:1
2616; EG-NEXT:    ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[]
2617; EG-NEXT:    ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
2618; EG-NEXT:    JUMP @8 POP:1
2619; EG-NEXT:    ALU 0, @34, KC0[CB0:0-32], KC1[]
2620; EG-NEXT:    TEX 0 @12
2621; EG-NEXT:    POP @8 POP:1
2622; EG-NEXT:    ALU 1, @35, KC0[], KC1[]
2623; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2624; EG-NEXT:    CF_END
2625; EG-NEXT:    PAD
2626; EG-NEXT:    Fetch clause starting at 12:
2627; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
2628; EG-NEXT:    ALU clause starting at 14:
2629; EG-NEXT:     OR_INT T0.W, KC0[2].W, KC0[3].X,
2630; EG-NEXT:     MOV * T1.W, literal.x,
2631; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
2632; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
2633; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
2634; EG-NEXT:    ALU clause starting at 19:
2635; EG-NEXT:     MOV T0.W, KC0[2].W,
2636; EG-NEXT:     MOV * T1.W, KC0[3].Z,
2637; EG-NEXT:     MOV T2.W, KC0[3].Y,
2638; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
2639; EG-NEXT:     MOV T1.W, KC0[3].X,
2640; EG-NEXT:     MULHI * T0.Y, T0.W, PV.W,
2641; EG-NEXT:     ADD_INT T3.W, PS, T0.X,
2642; EG-NEXT:     MULLO_INT * T0.X, PV.W, T2.W,
2643; EG-NEXT:     ADD_INT T0.Y, PV.W, PS,
2644; EG-NEXT:     MOV T1.W, literal.x,
2645; EG-NEXT:     MULLO_INT * T0.X, T0.W, T2.W,
2646; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2647; EG-NEXT:    ALU clause starting at 31:
2648; EG-NEXT:     MOV T0.W, KC0[2].Y,
2649; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
2650; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2651; EG-NEXT:    ALU clause starting at 34:
2652; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2653; EG-NEXT:    ALU clause starting at 35:
2654; EG-NEXT:     LSHR * T1.X, T0.W, literal.x,
2655; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2656entry:
2657  %0 = icmp eq i64 %a, 0 ; true when %a is zero
2658  br i1 %0, label %if, label %else ; zero goes to %if, anything else to %else
2659
2660if:
2661  %1 = load i64, ptr addrspace(1) %in ; zero case, result comes from memory
2662  br label %endif
2663
2664else:
2665  %2 = mul i64 %a, %b ; nonzero case, the 64-bit multiply under test
2666  br label %endif
2667
2668endif:
2669  %3 = phi i64 [%1, %if], [%2, %else] ; pick the value produced by the taken path
2670  store i64 %3, ptr addrspace(1) %out
2671  ret void
2672}
2673
; Test kernel that multiplies two i128 kernel arguments and stores the i128
; product to %out. An unnamed [8 x i32] argument precedes each i128 operand
; (presumably padding that spaces the operands apart in the argument list,
; to be confirmed against the intent of the test). Attribute group #0 is
; defined outside this function.
; The per-target assertions below are generated (see the note on the first line
; of this file), so regenerate them with the update script instead of editing
; them by hand.
2674define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
2675; SI-LABEL: s_mul_i128:
2676; SI:       ; %bb.0: ; %entry
2677; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x13
2678; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x1f
2679; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2680; SI-NEXT:    s_mov_b32 s3, 0xf000
2681; SI-NEXT:    s_mov_b32 s2, -1
2682; SI-NEXT:    s_waitcnt lgkmcnt(0)
2683; SI-NEXT:    v_mov_b32_e32 v0, s6
2684; SI-NEXT:    v_mul_hi_u32 v0, s8, v0
2685; SI-NEXT:    v_mov_b32_e32 v1, s4
2686; SI-NEXT:    v_mul_hi_u32 v1, s10, v1
2687; SI-NEXT:    s_mul_i32 s7, s8, s7
2688; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
2689; SI-NEXT:    s_mul_i32 s7, s10, s5
2690; SI-NEXT:    s_mul_i32 s12, s9, s6
2691; SI-NEXT:    s_mul_i32 s6, s8, s6
2692; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
2693; SI-NEXT:    s_mul_i32 s7, s11, s4
2694; SI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
2695; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
2696; SI-NEXT:    s_mul_i32 s7, s10, s4
2697; SI-NEXT:    v_mov_b32_e32 v2, s6
2698; SI-NEXT:    v_add_i32_e32 v2, vcc, s7, v2
2699; SI-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
2700; SI-NEXT:    v_mov_b32_e32 v1, s8
2701; SI-NEXT:    v_mul_hi_u32 v5, s4, v1
2702; SI-NEXT:    v_mul_hi_u32 v1, s5, v1
2703; SI-NEXT:    v_mov_b32_e32 v3, s9
2704; SI-NEXT:    v_mul_hi_u32 v4, s4, v3
2705; SI-NEXT:    s_mul_i32 s7, s5, s8
2706; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
2707; SI-NEXT:    s_mul_i32 s6, s4, s9
2708; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
2709; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v5
2710; SI-NEXT:    v_mul_hi_u32 v3, s5, v3
2711; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
2712; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
2713; SI-NEXT:    s_mul_i32 s5, s5, s9
2714; SI-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, 0, vcc
2715; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v4
2716; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2717; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
2718; SI-NEXT:    s_mul_i32 s4, s4, s8
2719; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
2720; SI-NEXT:    v_mov_b32_e32 v0, s4
2721; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2722; SI-NEXT:    s_endpgm
2723;
2724; VI-LABEL: s_mul_i128:
2725; VI:       ; %bb.0: ; %entry
2726; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
2727; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
2728; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2729; VI-NEXT:    v_mov_b32_e32 v5, 0
2730; VI-NEXT:    s_mov_b32 s3, 0xf000
2731; VI-NEXT:    s_waitcnt lgkmcnt(0)
2732; VI-NEXT:    v_mov_b32_e32 v0, s6
2733; VI-NEXT:    v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
2734; VI-NEXT:    s_mul_i32 s7, s8, s7
2735; VI-NEXT:    v_mov_b32_e32 v6, s8
2736; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
2737; VI-NEXT:    s_mul_i32 s12, s9, s6
2738; VI-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
2739; VI-NEXT:    v_add_u32_e32 v3, vcc, s12, v3
2740; VI-NEXT:    v_mov_b32_e32 v4, v1
2741; VI-NEXT:    v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
2742; VI-NEXT:    v_mov_b32_e32 v8, s4
2743; VI-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
2744; VI-NEXT:    v_mov_b32_e32 v3, v7
2745; VI-NEXT:    v_mov_b32_e32 v7, v5
2746; VI-NEXT:    v_mov_b32_e32 v8, s9
2747; VI-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
2748; VI-NEXT:    s_mul_i32 s8, s11, s4
2749; VI-NEXT:    v_add_u32_e32 v6, vcc, s8, v2
2750; VI-NEXT:    v_mov_b32_e32 v2, v5
2751; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
2752; VI-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
2753; VI-NEXT:    s_mul_i32 s8, s10, s5
2754; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
2755; VI-NEXT:    v_add_u32_e32 v5, vcc, s8, v6
2756; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
2757; VI-NEXT:    s_mov_b32 s2, -1
2758; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2759; VI-NEXT:    v_mov_b32_e32 v1, v4
2760; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2761; VI-NEXT:    s_endpgm
2762;
2763; GFX9-LABEL: s_mul_i128:
2764; GFX9:       ; %bb.0: ; %entry
2765; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x4c
2766; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x7c
2767; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2768; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2769; GFX9-NEXT:    s_mov_b32 s6, -1
2770; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2771; GFX9-NEXT:    s_mul_i32 s0, s12, s11
2772; GFX9-NEXT:    s_mul_hi_u32 s1, s12, s10
2773; GFX9-NEXT:    s_mul_i32 s2, s14, s9
2774; GFX9-NEXT:    s_mul_hi_u32 s3, s14, s8
2775; GFX9-NEXT:    s_add_i32 s0, s1, s0
2776; GFX9-NEXT:    s_mul_i32 s1, s13, s10
2777; GFX9-NEXT:    s_add_i32 s2, s3, s2
2778; GFX9-NEXT:    s_mul_i32 s3, s15, s8
2779; GFX9-NEXT:    s_add_i32 s0, s0, s1
2780; GFX9-NEXT:    s_mul_i32 s1, s12, s10
2781; GFX9-NEXT:    s_add_i32 s2, s2, s3
2782; GFX9-NEXT:    s_mul_i32 s3, s14, s8
2783; GFX9-NEXT:    s_add_u32 s3, s3, s1
2784; GFX9-NEXT:    s_addc_u32 s2, s2, s0
2785; GFX9-NEXT:    s_mul_i32 s14, s9, s12
2786; GFX9-NEXT:    s_mul_hi_u32 s15, s8, s12
2787; GFX9-NEXT:    s_mul_hi_u32 s11, s9, s12
2788; GFX9-NEXT:    s_add_u32 s14, s14, s15
2789; GFX9-NEXT:    s_mul_i32 s1, s8, s13
2790; GFX9-NEXT:    s_addc_u32 s11, s11, 0
2791; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s13
2792; GFX9-NEXT:    s_add_u32 s1, s1, s14
2793; GFX9-NEXT:    s_addc_u32 s10, s10, 0
2794; GFX9-NEXT:    s_add_u32 s10, s11, s10
2795; GFX9-NEXT:    s_addc_u32 s11, 0, 0
2796; GFX9-NEXT:    s_mul_hi_u32 s14, s9, s13
2797; GFX9-NEXT:    s_mul_i32 s9, s9, s13
2798; GFX9-NEXT:    s_add_u32 s9, s9, s10
2799; GFX9-NEXT:    s_addc_u32 s10, s14, s11
2800; GFX9-NEXT:    s_mov_b32 s0, 0
2801; GFX9-NEXT:    s_add_u32 s9, s9, s3
2802; GFX9-NEXT:    s_addc_u32 s10, s10, s2
2803; GFX9-NEXT:    s_mul_i32 s2, s8, s12
2804; GFX9-NEXT:    s_mov_b32 s3, s0
2805; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
2806; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2807; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2808; GFX9-NEXT:    v_mov_b32_e32 v2, s9
2809; GFX9-NEXT:    v_mov_b32_e32 v3, s10
2810; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2811; GFX9-NEXT:    s_endpgm
2812;
2813; GFX10-LABEL: s_mul_i128:
2814; GFX10:       ; %bb.0: ; %entry
2815; GFX10-NEXT:    s_clause 0x1
2816; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
2817; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
2818; GFX10-NEXT:    s_mov_b32 s2, 0
2819; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2820; GFX10-NEXT:    s_mov_b32 s13, s2
2821; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2822; GFX10-NEXT:    s_mul_i32 s3, s8, s7
2823; GFX10-NEXT:    s_mul_hi_u32 s7, s8, s6
2824; GFX10-NEXT:    s_mul_i32 s14, s10, s5
2825; GFX10-NEXT:    s_mul_hi_u32 s15, s10, s4
2826; GFX10-NEXT:    s_mul_i32 s12, s9, s6
2827; GFX10-NEXT:    s_mul_i32 s11, s11, s4
2828; GFX10-NEXT:    s_add_i32 s3, s7, s3
2829; GFX10-NEXT:    s_add_i32 s7, s15, s14
2830; GFX10-NEXT:    s_mul_i32 s6, s8, s6
2831; GFX10-NEXT:    s_mul_i32 s10, s10, s4
2832; GFX10-NEXT:    s_add_i32 s3, s3, s12
2833; GFX10-NEXT:    s_add_i32 s7, s7, s11
2834; GFX10-NEXT:    s_mul_i32 s19, s5, s8
2835; GFX10-NEXT:    s_mul_hi_u32 s20, s4, s8
2836; GFX10-NEXT:    s_add_u32 s6, s10, s6
2837; GFX10-NEXT:    s_mul_hi_u32 s18, s5, s8
2838; GFX10-NEXT:    s_addc_u32 s7, s7, s3
2839; GFX10-NEXT:    s_mul_i32 s17, s4, s9
2840; GFX10-NEXT:    s_add_u32 s3, s19, s20
2841; GFX10-NEXT:    s_mul_hi_u32 s16, s4, s9
2842; GFX10-NEXT:    s_mul_hi_u32 s21, s5, s9
2843; GFX10-NEXT:    s_mul_i32 s5, s5, s9
2844; GFX10-NEXT:    s_addc_u32 s9, s18, 0
2845; GFX10-NEXT:    s_add_u32 s3, s17, s3
2846; GFX10-NEXT:    s_addc_u32 s10, s16, 0
2847; GFX10-NEXT:    s_mul_i32 s12, s4, s8
2848; GFX10-NEXT:    s_add_u32 s4, s9, s10
2849; GFX10-NEXT:    s_addc_u32 s8, 0, 0
2850; GFX10-NEXT:    s_add_u32 s4, s5, s4
2851; GFX10-NEXT:    s_addc_u32 s5, s21, s8
2852; GFX10-NEXT:    s_add_u32 s4, s4, s6
2853; GFX10-NEXT:    s_addc_u32 s5, s5, s7
2854; GFX10-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
2855; GFX10-NEXT:    v_mov_b32_e32 v2, s4
2856; GFX10-NEXT:    v_mov_b32_e32 v0, s2
2857; GFX10-NEXT:    v_mov_b32_e32 v1, s3
2858; GFX10-NEXT:    v_mov_b32_e32 v3, s5
2859; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2860; GFX10-NEXT:    s_mov_b32 s2, -1
2861; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2862; GFX10-NEXT:    s_endpgm
2863;
2864; GFX11-LABEL: s_mul_i128:
2865; GFX11:       ; %bb.0: ; %entry
2866; GFX11-NEXT:    s_clause 0x2
2867; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x4c
2868; GFX11-NEXT:    s_load_b128 s[8:11], s[0:1], 0x7c
2869; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2870; GFX11-NEXT:    s_mov_b32 s2, 0
2871; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2872; GFX11-NEXT:    s_mov_b32 s13, s2
2873; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2874; GFX11-NEXT:    s_mul_i32 s3, s8, s7
2875; GFX11-NEXT:    s_mul_hi_u32 s7, s8, s6
2876; GFX11-NEXT:    s_mul_i32 s14, s10, s5
2877; GFX11-NEXT:    s_mul_hi_u32 s15, s10, s4
2878; GFX11-NEXT:    s_mul_i32 s12, s9, s6
2879; GFX11-NEXT:    s_mul_i32 s11, s11, s4
2880; GFX11-NEXT:    s_add_i32 s3, s7, s3
2881; GFX11-NEXT:    s_add_i32 s7, s15, s14
2882; GFX11-NEXT:    s_mul_i32 s6, s8, s6
2883; GFX11-NEXT:    s_mul_i32 s10, s10, s4
2884; GFX11-NEXT:    s_add_i32 s3, s3, s12
2885; GFX11-NEXT:    s_add_i32 s7, s7, s11
2886; GFX11-NEXT:    s_mul_i32 s19, s5, s8
2887; GFX11-NEXT:    s_mul_hi_u32 s20, s4, s8
2888; GFX11-NEXT:    s_add_u32 s6, s10, s6
2889; GFX11-NEXT:    s_mul_hi_u32 s18, s5, s8
2890; GFX11-NEXT:    s_addc_u32 s7, s7, s3
2891; GFX11-NEXT:    s_mul_i32 s17, s4, s9
2892; GFX11-NEXT:    s_add_u32 s3, s19, s20
2893; GFX11-NEXT:    s_mul_hi_u32 s16, s4, s9
2894; GFX11-NEXT:    s_mul_hi_u32 s21, s5, s9
2895; GFX11-NEXT:    s_mul_i32 s5, s5, s9
2896; GFX11-NEXT:    s_addc_u32 s9, s18, 0
2897; GFX11-NEXT:    s_add_u32 s3, s17, s3
2898; GFX11-NEXT:    s_addc_u32 s10, s16, 0
2899; GFX11-NEXT:    s_mul_i32 s12, s4, s8
2900; GFX11-NEXT:    s_add_u32 s4, s9, s10
2901; GFX11-NEXT:    s_addc_u32 s8, 0, 0
2902; GFX11-NEXT:    s_add_u32 s4, s5, s4
2903; GFX11-NEXT:    s_addc_u32 s5, s21, s8
2904; GFX11-NEXT:    s_add_u32 s4, s4, s6
2905; GFX11-NEXT:    s_addc_u32 s5, s5, s7
2906; GFX11-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
2907; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2908; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
2909; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
2910; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2911; GFX11-NEXT:    s_mov_b32 s2, -1
2912; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
2913; GFX11-NEXT:    s_nop 0
2914; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2915; GFX11-NEXT:    s_endpgm
2916;
2917; GFX12-LABEL: s_mul_i128:
2918; GFX12:       ; %bb.0: ; %entry
2919; GFX12-NEXT:    s_clause 0x1
2920; GFX12-NEXT:    s_load_b128 s[4:7], s[0:1], 0x7c
2921; GFX12-NEXT:    s_load_b128 s[8:11], s[0:1], 0x4c
2922; GFX12-NEXT:    s_mov_b32 s3, 0
2923; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2924; GFX12-NEXT:    s_mov_b32 s15, s3
2925; GFX12-NEXT:    s_mov_b32 s13, s3
2926; GFX12-NEXT:    s_mov_b32 s17, s3
2927; GFX12-NEXT:    s_mov_b32 s19, s3
2928; GFX12-NEXT:    s_mov_b32 s24, s3
2929; GFX12-NEXT:    s_wait_kmcnt 0x0
2930; GFX12-NEXT:    s_mov_b32 s2, s4
2931; GFX12-NEXT:    s_mov_b32 s14, s8
2932; GFX12-NEXT:    s_mov_b32 s12, s9
2933; GFX12-NEXT:    s_mul_u64 s[22:23], s[14:15], s[2:3]
2934; GFX12-NEXT:    s_mul_u64 s[20:21], s[12:13], s[2:3]
2935; GFX12-NEXT:    s_mov_b32 s2, s23
2936; GFX12-NEXT:    s_mov_b32 s16, s5
2937; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[10:11]
2938; GFX12-NEXT:    s_add_nc_u64 s[10:11], s[20:21], s[2:3]
2939; GFX12-NEXT:    s_mul_u64 s[6:7], s[6:7], s[8:9]
2940; GFX12-NEXT:    s_mul_u64 s[8:9], s[14:15], s[16:17]
2941; GFX12-NEXT:    s_mov_b32 s2, s11
2942; GFX12-NEXT:    s_mov_b32 s11, s3
2943; GFX12-NEXT:    s_add_nc_u64 s[4:5], s[6:7], s[4:5]
2944; GFX12-NEXT:    s_add_nc_u64 s[6:7], s[8:9], s[10:11]
2945; GFX12-NEXT:    s_mul_u64 s[12:13], s[12:13], s[16:17]
2946; GFX12-NEXT:    s_mov_b32 s18, s7
2947; GFX12-NEXT:    s_mov_b32 s23, s3
2948; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[18:19]
2949; GFX12-NEXT:    s_mov_b32 s25, s6
2950; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[12:13], s[2:3]
2951; GFX12-NEXT:    s_or_b64 s[6:7], s[22:23], s[24:25]
2952; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
2953; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
2954; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2955; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
2956; GFX12-NEXT:    s_mov_b32 s2, -1
2957; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
2958; GFX12-NEXT:    s_nop 0
2959; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2960; GFX12-NEXT:    s_endpgm
2961;
2962; EG-LABEL: s_mul_i128:
2963; EG:       ; %bb.0: ; %entry
2964; EG-NEXT:    ALU 41, @4, KC0[CB0:0-32], KC1[]
2965; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2966; EG-NEXT:    CF_END
2967; EG-NEXT:    PAD
2968; EG-NEXT:    ALU clause starting at 4:
2969; EG-NEXT:     MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
2970; EG-NEXT:     MULHI * T0.Y, KC0[5].X, KC0[8].X,
2971; EG-NEXT:     MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
2972; EG-NEXT:     MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
2973; EG-NEXT:     MULHI * T1.X, KC0[5].X, KC0[7].W,
2974; EG-NEXT:     MULHI * T1.Y, KC0[4].W, KC0[8].X,
2975; EG-NEXT:     MULHI * T1.Z, KC0[8].Y, KC0[4].W,
2976; EG-NEXT:     MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
2977; EG-NEXT:     MULHI * T2.X, KC0[7].W, KC0[5].Y,
2978; EG-NEXT:     MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
2979; EG-NEXT:     MULHI * T2.Z, KC0[4].W, KC0[7].W,
2980; EG-NEXT:     ADD_INT T2.W, T2.Y, PS,
2981; EG-NEXT:     MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
2982; EG-NEXT:     ADDC_UINT T2.Z, T2.Y, T2.Z,
2983; EG-NEXT:     ADDC_UINT T3.W, PS, PV.W,
2984; EG-NEXT:     MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
2985; EG-NEXT:     ADD_INT T2.X, T2.X, PS,
2986; EG-NEXT:     ADD_INT T2.Y, T1.Z, T1.W,
2987; EG-NEXT:     ADD_INT T1.Z, T1.Y, PV.W,
2988; EG-NEXT:     ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
2989; EG-NEXT:     MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
2990; EG-NEXT:     ADD_INT T4.X, PV.W, PV.Z,
2991; EG-NEXT:     ADDC_UINT T1.Y, PV.W, PV.Z,
2992; EG-NEXT:     ADD_INT T1.Z, PV.Y, PS,
2993; EG-NEXT:     ADD_INT T0.W, PV.X, T0.W,
2994; EG-NEXT:     MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
2995; EG-NEXT:     ADD_INT T2.Y, PV.Z, PV.W,
2996; EG-NEXT:     ADDC_UINT T1.Z, T0.Z, PS,
2997; EG-NEXT:     ADD_INT T0.W, T0.Y, PV.Y,
2998; EG-NEXT:     ADDC_UINT * T1.W, T0.X, PV.X,
2999; EG-NEXT:     ADD_INT T0.Y, T0.X, T4.X,
3000; EG-NEXT:     ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
3001; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
3002; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
3003; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
3004; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
3005; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
3006; EG-NEXT:     ADD_INT * T0.Z, T0.Y, T0.Z,
3007; EG-NEXT:     ADD_INT * T0.Y, T3.X, T2.W,
3008; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3009; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3010; EG-NEXT:     MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
3011entry:
3012  %mul = mul i128 %a, %b ; the i128 multiply whose lowering is checked above
3013  store i128 %mul, ptr addrspace(1) %out ; write the full i128 product to %out
3014  ret void
3015}
3016
3017define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
3018; SI-LABEL: v_mul_i128:
3019; SI:       ; %bb.0: ; %entry
3020; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
3021; SI-NEXT:    s_mov_b32 s7, 0xf000
3022; SI-NEXT:    s_mov_b32 s6, 0
3023; SI-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
3024; SI-NEXT:    v_mov_b32_e32 v9, 0
3025; SI-NEXT:    s_waitcnt lgkmcnt(0)
3026; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
3027; SI-NEXT:    s_mov_b64 s[0:1], s[2:3]
3028; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
3029; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
3030; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
3031; SI-NEXT:    s_waitcnt vmcnt(0)
3032; SI-NEXT:    v_mul_lo_u32 v3, v4, v3
3033; SI-NEXT:    v_mul_hi_u32 v10, v4, v2
3034; SI-NEXT:    v_mul_lo_u32 v12, v6, v1
3035; SI-NEXT:    v_mul_hi_u32 v13, v6, v0
3036; SI-NEXT:    v_mul_lo_u32 v17, v1, v4
3037; SI-NEXT:    v_mul_hi_u32 v18, v0, v4
3038; SI-NEXT:    v_mul_lo_u32 v11, v5, v2
3039; SI-NEXT:    v_mul_lo_u32 v7, v7, v0
3040; SI-NEXT:    v_mul_hi_u32 v16, v1, v4
3041; SI-NEXT:    v_mul_lo_u32 v15, v0, v5
3042; SI-NEXT:    v_mul_hi_u32 v14, v0, v5
3043; SI-NEXT:    v_mul_hi_u32 v19, v1, v5
3044; SI-NEXT:    v_mul_lo_u32 v5, v1, v5
3045; SI-NEXT:    v_add_i32_e32 v1, vcc, v10, v3
3046; SI-NEXT:    v_add_i32_e32 v3, vcc, v13, v12
3047; SI-NEXT:    v_mul_lo_u32 v2, v4, v2
3048; SI-NEXT:    v_mul_lo_u32 v6, v6, v0
3049; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
3050; SI-NEXT:    v_add_i32_e32 v4, vcc, v17, v18
3051; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v16, vcc
3052; SI-NEXT:    v_add_i32_e32 v11, vcc, v1, v11
3053; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
3054; SI-NEXT:    v_add_i32_e32 v1, vcc, v15, v4
3055; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
3056; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
3057; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
3058; SI-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
3059; SI-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
3060; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
3061; SI-NEXT:    v_addc_u32_e32 v5, vcc, v19, v6, vcc
3062; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
3063; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
3064; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
3065; SI-NEXT:    s_endpgm
3066;
3067; VI-LABEL: v_mul_i128:
3068; VI:       ; %bb.0: ; %entry
3069; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
3070; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
3071; VI-NEXT:    v_mov_b32_e32 v11, 0
3072; VI-NEXT:    s_waitcnt lgkmcnt(0)
3073; VI-NEXT:    v_mov_b32_e32 v1, s1
3074; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3075; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3076; VI-NEXT:    v_mov_b32_e32 v3, s3
3077; VI-NEXT:    v_add_u32_e32 v8, vcc, s2, v2
3078; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
3079; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3080; VI-NEXT:    flat_load_dwordx4 v[4:7], v[8:9]
3081; VI-NEXT:    s_waitcnt vmcnt(0)
3082; VI-NEXT:    v_mul_lo_u32 v10, v4, v3
3083; VI-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
3084; VI-NEXT:    v_mul_lo_u32 v14, v5, v2
3085; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
3086; VI-NEXT:    v_add_u32_e32 v13, vcc, v13, v10
3087; VI-NEXT:    v_mov_b32_e32 v10, v3
3088; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
3089; VI-NEXT:    v_add_u32_e32 v13, vcc, v13, v14
3090; VI-NEXT:    v_mov_b32_e32 v10, v4
3091; VI-NEXT:    v_mov_b32_e32 v4, v11
3092; VI-NEXT:    v_mul_lo_u32 v7, v7, v0
3093; VI-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
3094; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
3095; VI-NEXT:    v_add_u32_e32 v13, vcc, v7, v13
3096; VI-NEXT:    v_mov_b32_e32 v0, v4
3097; VI-NEXT:    v_mul_lo_u32 v11, v6, v1
3098; VI-NEXT:    v_add_u32_e32 v6, vcc, v10, v0
3099; VI-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
3100; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
3101; VI-NEXT:    v_add_u32_e32 v5, vcc, v11, v13
3102; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v12
3103; VI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
3104; VI-NEXT:    flat_store_dwordx4 v[8:9], v[2:5]
3105; VI-NEXT:    s_endpgm
3106;
3107; GFX9-LABEL: v_mul_i128:
3108; GFX9:       ; %bb.0: ; %entry
3109; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
3110; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
3111; GFX9-NEXT:    v_mov_b32_e32 v10, 0
3112; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3113; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
3114; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
3115; GFX9-NEXT:    s_waitcnt vmcnt(0)
3116; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
3117; GFX9-NEXT:    v_mul_lo_u32 v14, v5, v2
3118; GFX9-NEXT:    v_mul_lo_u32 v15, v4, v3
3119; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
3120; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
3121; GFX9-NEXT:    v_mul_lo_u32 v16, v7, v0
3122; GFX9-NEXT:    v_mov_b32_e32 v7, v12
3123; GFX9-NEXT:    v_mov_b32_e32 v12, v10
3124; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
3125; GFX9-NEXT:    v_add3_u32 v3, v3, v15, v14
3126; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
3127; GFX9-NEXT:    v_mov_b32_e32 v0, v10
3128; GFX9-NEXT:    v_mul_lo_u32 v4, v6, v1
3129; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v0
3130; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
3131; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
3132; GFX9-NEXT:    v_add3_u32 v3, v16, v3, v4
3133; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v0, v2
3134; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
3135; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
3136; GFX9-NEXT:    s_endpgm
3137;
3138; GFX10-LABEL: v_mul_i128:
3139; GFX10:       ; %bb.0: ; %entry
3140; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
3141; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
3142; GFX10-NEXT:    v_mov_b32_e32 v10, 0
3143; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3144; GFX10-NEXT:    s_clause 0x1
3145; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
3146; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
3147; GFX10-NEXT:    s_waitcnt vmcnt(0)
3148; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
3149; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v2
3150; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v0
3151; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
3152; GFX10-NEXT:    v_mov_b32_e32 v14, v12
3153; GFX10-NEXT:    v_mov_b32_e32 v12, v10
3154; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12]
3155; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v3
3156; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v2, 0
3157; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v1
3158; GFX10-NEXT:    v_mov_b32_e32 v4, v10
3159; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v15
3160; GFX10-NEXT:    v_add_co_u32 v10, s0, v14, v4
3161; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s0, 0, 0, s0
3162; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
3163; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11]
3164; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v12
3165; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
3166; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
3167; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
3168; GFX10-NEXT:    s_endpgm
3169;
3170; GFX11-LABEL: v_mul_i128:
3171; GFX11:       ; %bb.0: ; %entry
3172; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x2c
3173; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
3174; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3175; GFX11-NEXT:    s_clause 0x1
3176; GFX11-NEXT:    global_load_b128 v[0:3], v15, s[0:1]
3177; GFX11-NEXT:    global_load_b128 v[4:7], v15, s[2:3]
3178; GFX11-NEXT:    s_waitcnt vmcnt(0)
3179; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v0, v4, 0
3180; GFX11-NEXT:    v_mul_lo_u32 v14, v5, v2
3181; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
3182; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3183; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
3184; GFX11-NEXT:    v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
3185; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3186; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
3187; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v4, v2, 0
3188; GFX11-NEXT:    v_mul_lo_u32 v4, v6, v1
3189; GFX11-NEXT:    v_mov_b32_e32 v2, v10
3190; GFX11-NEXT:    v_mul_lo_u32 v10, v7, v0
3191; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
3192; GFX11-NEXT:    v_add3_u32 v12, v12, v3, v14
3193; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2
3194; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3195; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
3196; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12]
3197; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3198; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
3199; GFX11-NEXT:    v_add3_u32 v0, v10, v14, v4
3200; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3201; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v13
3202; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
3203; GFX11-NEXT:    global_store_b128 v15, v[8:11], s[2:3]
3204; GFX11-NEXT:    s_nop 0
3205; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3206; GFX11-NEXT:    s_endpgm
3207;
3208; GFX12-LABEL: v_mul_i128:
3209; GFX12:       ; %bb.0: ; %entry
3210; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x2c
3211; GFX12-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
3212; GFX12-NEXT:    s_wait_kmcnt 0x0
3213; GFX12-NEXT:    s_clause 0x1
3214; GFX12-NEXT:    global_load_b128 v[0:3], v13, s[0:1]
3215; GFX12-NEXT:    global_load_b128 v[4:7], v13, s[2:3]
3216; GFX12-NEXT:    s_wait_loadcnt 0x0
3217; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
3218; GFX12-NEXT:    v_mul_lo_u32 v15, v5, v2
3219; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v0
3220; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3221; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
3222; GFX12-NEXT:    v_mov_b32_e32 v14, v12
3223; GFX12-NEXT:    v_mov_b32_e32 v12, v10
3224; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3225; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
3226; GFX12-NEXT:    v_mul_lo_u32 v11, v4, v3
3227; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v4, v2, 0
3228; GFX12-NEXT:    v_mul_lo_u32 v12, v6, v1
3229; GFX12-NEXT:    v_mov_b32_e32 v4, v10
3230; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3231; GFX12-NEXT:    v_add3_u32 v3, v3, v11, v15
3232; GFX12-NEXT:    v_add_co_u32 v10, s0, v14, v4
3233; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3234; GFX12-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
3235; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3]
3236; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3237; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11]
3238; GFX12-NEXT:    v_add3_u32 v3, v7, v3, v12
3239; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3240; GFX12-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
3241; GFX12-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
3242; GFX12-NEXT:    global_store_b128 v13, v[8:11], s[2:3]
3243; GFX12-NEXT:    s_nop 0
3244; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3245; GFX12-NEXT:    s_endpgm
3246;
3247; EG-LABEL: v_mul_i128:
3248; EG:       ; %bb.0: ; %entry
3249; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
3250; EG-NEXT:    TEX 1 @6
3251; EG-NEXT:    ALU 41, @14, KC0[], KC1[]
3252; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
3253; EG-NEXT:    CF_END
3254; EG-NEXT:    PAD
3255; EG-NEXT:    Fetch clause starting at 6:
3256; EG-NEXT:     VTX_READ_128 T2.XYZW, T1.X, 0, #1
3257; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
3258; EG-NEXT:    ALU clause starting at 10:
3259; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
3260; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
3261; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
3262; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
3263; EG-NEXT:    ALU clause starting at 14:
3264; EG-NEXT:     MULLO_INT * T1.Y, T0.Y, T2.Y,
3265; EG-NEXT:     MULHI * T1.Z, T0.Y, T2.Y,
3266; EG-NEXT:     MULLO_INT * T1.W, T2.Z, T0.X,
3267; EG-NEXT:     MULLO_INT * T3.X, T2.Y, T0.Z,
3268; EG-NEXT:     MULHI * T3.Y, T0.Y, T2.X,
3269; EG-NEXT:     MULHI * T3.Z, T0.X, T2.Y,
3270; EG-NEXT:     MULHI * T3.W, T2.Z, T0.X,
3271; EG-NEXT:     MULLO_INT * T2.Z, T2.Z, T0.Y,
3272; EG-NEXT:     MULHI * T4.X, T2.X, T0.Z,
3273; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T2.X,
3274; EG-NEXT:     MULHI * T4.Y, T0.X, T2.X,
3275; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
3276; EG-NEXT:     MULLO_INT * T2.Y, T0.X, T2.Y,
3277; EG-NEXT:     ADDC_UINT T4.Z, T0.Y, T4.Y,
3278; EG-NEXT:     ADDC_UINT T5.W, PS, PV.W,
3279; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.W,
3280; EG-NEXT:     ADD_INT T4.X, T4.X, PS,
3281; EG-NEXT:     ADD_INT T0.Y, T3.W, T2.Z,
3282; EG-NEXT:     ADD_INT T2.Z, T3.Z, PV.W,
3283; EG-NEXT:     ADD_INT T0.W, T3.Y, PV.Z,
3284; EG-NEXT:     MULLO_INT * T2.W, T2.W, T0.X,
3285; EG-NEXT:     ADD_INT T5.X, PV.W, PV.Z,
3286; EG-NEXT:     ADDC_UINT T3.Y, PV.W, PV.Z,
3287; EG-NEXT:     ADD_INT T2.Z, PV.Y, PS,
3288; EG-NEXT:     ADD_INT T0.W, PV.X, T3.X,
3289; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.Z,
3290; EG-NEXT:     ADD_INT T4.Y, PV.Z, PV.W,
3291; EG-NEXT:     ADDC_UINT T0.Z, T1.W, PS,
3292; EG-NEXT:     ADD_INT T0.W, T1.Z, PV.Y,
3293; EG-NEXT:     ADDC_UINT * T2.W, T1.Y, PV.X,
3294; EG-NEXT:     ADD_INT T1.Y, T1.Y, T5.X,
3295; EG-NEXT:     ADD_INT T1.Z, T1.W, T0.Y,
3296; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
3297; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
3298; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
3299; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
3300; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
3301; EG-NEXT:     ADD_INT * T0.Z, T1.Y, T1.Z,
3302; EG-NEXT:     ADD_INT * T0.Y, T2.Y, T4.W,
3303; EG-NEXT:     LSHR T1.X, T1.X, literal.x,
3304; EG-NEXT:     MULLO_INT * T0.X, T0.X, T2.X,
3305; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3306entry:
3307  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3308  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
3309  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
3310  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
3311  %a = load i128, ptr addrspace(1) %gep.a
3312  %b = load i128, ptr addrspace(1) %gep.b
3313  %mul = mul i128 %a, %b
3314  store i128 %mul, ptr addrspace(1) %gep.out
3315  ret void
3316}
3317
; Multiply an i32 by the constant 9 (2^3 + 1). Per the checks below, SI and VI
; keep a v_mul_lo_u32 by 9, while GFX9 through GFX12 select a single
; v_lshl_add_u32 (shift left by 3, then add the original value). The EG checks
; for this function contain only CF_END and PAD.
3318define i32 @mul_pow2_plus_1(i32 %val) {
3319; SI-LABEL: mul_pow2_plus_1:
3320; SI:       ; %bb.0:
3321; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3322; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
3323; SI-NEXT:    s_setpc_b64 s[30:31]
3324;
3325; VI-LABEL: mul_pow2_plus_1:
3326; VI:       ; %bb.0:
3327; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3328; VI-NEXT:    v_mul_lo_u32 v0, v0, 9
3329; VI-NEXT:    s_setpc_b64 s[30:31]
3330;
3331; GFX9-LABEL: mul_pow2_plus_1:
3332; GFX9:       ; %bb.0:
3333; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3334; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3335; GFX9-NEXT:    s_setpc_b64 s[30:31]
3336;
3337; GFX10-LABEL: mul_pow2_plus_1:
3338; GFX10:       ; %bb.0:
3339; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3340; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3341; GFX10-NEXT:    s_setpc_b64 s[30:31]
3342;
3343; GFX11-LABEL: mul_pow2_plus_1:
3344; GFX11:       ; %bb.0:
3345; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3346; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3347; GFX11-NEXT:    s_setpc_b64 s[30:31]
3348;
3349; GFX12-LABEL: mul_pow2_plus_1:
3350; GFX12:       ; %bb.0:
3351; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3352; GFX12-NEXT:    s_wait_expcnt 0x0
3353; GFX12-NEXT:    s_wait_samplecnt 0x0
3354; GFX12-NEXT:    s_wait_bvhcnt 0x0
3355; GFX12-NEXT:    s_wait_kmcnt 0x0
3356; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
3357; GFX12-NEXT:    s_setpc_b64 s[30:31]
3358;
3359; EG-LABEL: mul_pow2_plus_1:
3360; EG:       ; %bb.0:
3361; EG-NEXT:    CF_END
3362; EG-NEXT:    PAD
3363  %mul = mul i32 %val, 9
3364  ret i32 %mul
3365}
3366
; Declaration of the work-item id intrinsic that the preceding i128 multiply
; test calls to compute %tid; it is tagged with attribute group #1.
3367declare i32 @llvm.amdgcn.workitem.id.x() #1
3368
; Attribute groups. #1 is the group attached to the intrinsic declaration
; above. #0 is not referenced in the nearby functions shown here; presumably
; it is used by definitions earlier in the file (TODO confirm).
3369attributes #0 = { nounwind }
3370attributes #1 = { nounwind readnone}
3371