xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mul.ll (revision cc3d2533cc2e4ea06981b86ede5087fbf801e789)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
8
9; mul24 and mad24 are affected
10
; Element-wise 32-bit multiply of two <2 x i32> vectors.
; The kernel loads the vector at %in and the vector immediately after it
; (getelementptr index 1), multiplies them lane by lane, and stores the
; <2 x i32> product to %out. The expected output below contains one low
; 32-bit multiply per lane on every checked target (v_mul_lo_u32 on the
; amdgcn targets, MULLO_INT on r600) and no high-half multiply.
11define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
12; SI-LABEL: test_mul_v2i32:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s6, -1
17; SI-NEXT:    s_mov_b32 s10, s6
18; SI-NEXT:    s_mov_b32 s11, s7
19; SI-NEXT:    s_waitcnt lgkmcnt(0)
20; SI-NEXT:    s_mov_b32 s8, s2
21; SI-NEXT:    s_mov_b32 s9, s3
22; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
23; SI-NEXT:    s_mov_b32 s4, s0
24; SI-NEXT:    s_mov_b32 s5, s1
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_mul_lo_u32 v1, v1, v3
27; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
28; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
29; SI-NEXT:    s_endpgm
30;
31; VI-LABEL: test_mul_v2i32:
32; VI:       ; %bb.0: ; %entry
33; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
34; VI-NEXT:    s_mov_b32 s7, 0xf000
35; VI-NEXT:    s_mov_b32 s6, -1
36; VI-NEXT:    s_mov_b32 s10, s6
37; VI-NEXT:    s_mov_b32 s11, s7
38; VI-NEXT:    s_waitcnt lgkmcnt(0)
39; VI-NEXT:    s_mov_b32 s8, s2
40; VI-NEXT:    s_mov_b32 s9, s3
41; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
42; VI-NEXT:    s_mov_b32 s4, s0
43; VI-NEXT:    s_mov_b32 s5, s1
44; VI-NEXT:    s_waitcnt vmcnt(0)
45; VI-NEXT:    v_mul_lo_u32 v1, v1, v3
46; VI-NEXT:    v_mul_lo_u32 v0, v0, v2
47; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
48; VI-NEXT:    s_endpgm
49;
50; GFX9-LABEL: test_mul_v2i32:
51; GFX9:       ; %bb.0: ; %entry
52; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
53; GFX9-NEXT:    s_mov_b32 s7, 0xf000
54; GFX9-NEXT:    s_mov_b32 s6, -1
55; GFX9-NEXT:    s_mov_b32 s10, s6
56; GFX9-NEXT:    s_mov_b32 s11, s7
57; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-NEXT:    s_mov_b32 s8, s2
59; GFX9-NEXT:    s_mov_b32 s9, s3
60; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
61; GFX9-NEXT:    s_mov_b32 s4, s0
62; GFX9-NEXT:    s_mov_b32 s5, s1
63; GFX9-NEXT:    s_waitcnt vmcnt(0)
64; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
65; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
66; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
67; GFX9-NEXT:    s_endpgm
68;
69; GFX10-LABEL: test_mul_v2i32:
70; GFX10:       ; %bb.0: ; %entry
71; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
72; GFX10-NEXT:    s_mov_b32 s6, -1
73; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
74; GFX10-NEXT:    s_mov_b32 s10, s6
75; GFX10-NEXT:    s_mov_b32 s11, s7
76; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX10-NEXT:    s_mov_b32 s8, s2
78; GFX10-NEXT:    s_mov_b32 s9, s3
79; GFX10-NEXT:    s_mov_b32 s4, s0
80; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
81; GFX10-NEXT:    s_mov_b32 s5, s1
82; GFX10-NEXT:    s_waitcnt vmcnt(0)
83; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
84; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
85; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
86; GFX10-NEXT:    s_endpgm
87;
88; GFX11-LABEL: test_mul_v2i32:
89; GFX11:       ; %bb.0: ; %entry
90; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
91; GFX11-NEXT:    s_mov_b32 s6, -1
92; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
93; GFX11-NEXT:    s_mov_b32 s10, s6
94; GFX11-NEXT:    s_mov_b32 s11, s7
95; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX11-NEXT:    s_mov_b32 s8, s2
97; GFX11-NEXT:    s_mov_b32 s9, s3
98; GFX11-NEXT:    s_mov_b32 s4, s0
99; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
100; GFX11-NEXT:    s_mov_b32 s5, s1
101; GFX11-NEXT:    s_waitcnt vmcnt(0)
102; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v3
103; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v2
104; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
105; GFX11-NEXT:    s_nop 0
106; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
107; GFX11-NEXT:    s_endpgm
108;
109; EG-LABEL: test_mul_v2i32:
110; EG:       ; %bb.0: ; %entry
111; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
112; EG-NEXT:    TEX 0 @6
113; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
114; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
115; EG-NEXT:    CF_END
116; EG-NEXT:    PAD
117; EG-NEXT:    Fetch clause starting at 6:
118; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
119; EG-NEXT:    ALU clause starting at 8:
120; EG-NEXT:     MOV * T0.X, KC0[2].Z,
121; EG-NEXT:    ALU clause starting at 9:
122; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T0.W,
123; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
124; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Z,
125; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
126entry:
; %a is the vector at %in; %b is the next <2 x i32> after it. The expected
; output above fetches both with a single 128-bit load.
127  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
128  %a = load <2 x i32>, ptr addrspace(1) %in
129  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
130  %result = mul <2 x i32> %a, %b
131  store <2 x i32> %result, ptr addrspace(1) %out
132  ret void
133}
134
; Element-wise 32-bit multiply of two <4 x i32> vectors.
; The kernel loads the vector at %in and the vector immediately after it
; (getelementptr index 1), multiplies them lane by lane, and stores the
; <4 x i32> product to %out. The expected output below contains two
; 128-bit loads (offsets 0 and 16) and four low 32-bit multiplies on
; every checked target.
135define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
136; SI-LABEL: v_mul_v4i32:
137; SI:       ; %bb.0: ; %entry
138; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
139; SI-NEXT:    s_mov_b32 s7, 0xf000
140; SI-NEXT:    s_mov_b32 s6, -1
141; SI-NEXT:    s_mov_b32 s10, s6
142; SI-NEXT:    s_mov_b32 s11, s7
143; SI-NEXT:    s_waitcnt lgkmcnt(0)
144; SI-NEXT:    s_mov_b32 s8, s2
145; SI-NEXT:    s_mov_b32 s9, s3
146; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
147; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
148; SI-NEXT:    s_mov_b32 s4, s0
149; SI-NEXT:    s_mov_b32 s5, s1
150; SI-NEXT:    s_waitcnt vmcnt(0)
151; SI-NEXT:    v_mul_lo_u32 v3, v3, v7
152; SI-NEXT:    v_mul_lo_u32 v2, v2, v6
153; SI-NEXT:    v_mul_lo_u32 v1, v1, v5
154; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
155; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
156; SI-NEXT:    s_endpgm
157;
158; VI-LABEL: v_mul_v4i32:
159; VI:       ; %bb.0: ; %entry
160; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
161; VI-NEXT:    s_mov_b32 s7, 0xf000
162; VI-NEXT:    s_mov_b32 s6, -1
163; VI-NEXT:    s_mov_b32 s10, s6
164; VI-NEXT:    s_mov_b32 s11, s7
165; VI-NEXT:    s_waitcnt lgkmcnt(0)
166; VI-NEXT:    s_mov_b32 s8, s2
167; VI-NEXT:    s_mov_b32 s9, s3
168; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
169; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
170; VI-NEXT:    s_mov_b32 s4, s0
171; VI-NEXT:    s_mov_b32 s5, s1
172; VI-NEXT:    s_waitcnt vmcnt(0)
173; VI-NEXT:    v_mul_lo_u32 v3, v3, v7
174; VI-NEXT:    v_mul_lo_u32 v2, v2, v6
175; VI-NEXT:    v_mul_lo_u32 v1, v1, v5
176; VI-NEXT:    v_mul_lo_u32 v0, v0, v4
177; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
178; VI-NEXT:    s_endpgm
179;
180; GFX9-LABEL: v_mul_v4i32:
181; GFX9:       ; %bb.0: ; %entry
182; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
183; GFX9-NEXT:    s_mov_b32 s7, 0xf000
184; GFX9-NEXT:    s_mov_b32 s6, -1
185; GFX9-NEXT:    s_mov_b32 s10, s6
186; GFX9-NEXT:    s_mov_b32 s11, s7
187; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX9-NEXT:    s_mov_b32 s8, s2
189; GFX9-NEXT:    s_mov_b32 s9, s3
190; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
191; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
192; GFX9-NEXT:    s_mov_b32 s4, s0
193; GFX9-NEXT:    s_mov_b32 s5, s1
194; GFX9-NEXT:    s_waitcnt vmcnt(0)
195; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
196; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
197; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
198; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
199; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
200; GFX9-NEXT:    s_endpgm
201;
202; GFX10-LABEL: v_mul_v4i32:
203; GFX10:       ; %bb.0: ; %entry
204; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
205; GFX10-NEXT:    s_mov_b32 s6, -1
206; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
207; GFX10-NEXT:    s_mov_b32 s10, s6
208; GFX10-NEXT:    s_mov_b32 s11, s7
209; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX10-NEXT:    s_mov_b32 s8, s2
211; GFX10-NEXT:    s_mov_b32 s9, s3
212; GFX10-NEXT:    s_clause 0x1
213; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
214; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
215; GFX10-NEXT:    s_mov_b32 s4, s0
216; GFX10-NEXT:    s_mov_b32 s5, s1
217; GFX10-NEXT:    s_waitcnt vmcnt(0)
218; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
219; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
220; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
221; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
222; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
223; GFX10-NEXT:    s_endpgm
224;
225; GFX11-LABEL: v_mul_v4i32:
226; GFX11:       ; %bb.0: ; %entry
227; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
228; GFX11-NEXT:    s_mov_b32 s6, -1
229; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
230; GFX11-NEXT:    s_mov_b32 s10, s6
231; GFX11-NEXT:    s_mov_b32 s11, s7
232; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX11-NEXT:    s_mov_b32 s8, s2
234; GFX11-NEXT:    s_mov_b32 s9, s3
235; GFX11-NEXT:    s_clause 0x1
236; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
237; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
238; GFX11-NEXT:    s_mov_b32 s4, s0
239; GFX11-NEXT:    s_mov_b32 s5, s1
240; GFX11-NEXT:    s_waitcnt vmcnt(0)
241; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v7
242; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v6
243; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v5
244; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v4
245; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
246; GFX11-NEXT:    s_nop 0
247; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
248; GFX11-NEXT:    s_endpgm
249;
250; EG-LABEL: v_mul_v4i32:
251; EG:       ; %bb.0: ; %entry
252; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
253; EG-NEXT:    TEX 1 @6
254; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
255; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
256; EG-NEXT:    CF_END
257; EG-NEXT:    PAD
258; EG-NEXT:    Fetch clause starting at 6:
259; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
260; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
261; EG-NEXT:    ALU clause starting at 10:
262; EG-NEXT:     MOV * T0.X, KC0[2].Z,
263; EG-NEXT:    ALU clause starting at 11:
264; EG-NEXT:     MULLO_INT * T0.W, T0.W, T1.W,
265; EG-NEXT:     MULLO_INT * T0.Z, T0.Z, T1.Z,
266; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.Y,
267; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
268; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
269; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
270entry:
; %a is the vector at %in; %b is the next <4 x i32> after it (offset 16 in
; the expected loads above).
271  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
272  %a = load <4 x i32>, ptr addrspace(1) %in
273  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
274  %result = mul <4 x i32> %a, %b
275  store <4 x i32> %result, ptr addrspace(1) %out
276  ret void
277}
278
; 64-bit multiply of two kernel arguments whose result is truncated to i32.
; Only the low 32 bits of the product are stored to %out, and the expected
; output below contains a single 32-bit multiply (s_mul_i32 on the amdgcn
; targets, MULLO_INT on r600) with no high-half multiply.
279define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
280; SI-LABEL: s_trunc_i64_mul_to_i32:
281; SI:       ; %bb.0: ; %entry
282; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
283; SI-NEXT:    s_waitcnt lgkmcnt(0)
284; SI-NEXT:    s_load_dword s7, s[0:1], 0xd
285; SI-NEXT:    s_mov_b32 s3, 0xf000
286; SI-NEXT:    s_mov_b32 s2, -1
287; SI-NEXT:    s_mov_b32 s0, s4
288; SI-NEXT:    s_waitcnt lgkmcnt(0)
289; SI-NEXT:    s_mul_i32 s4, s7, s6
290; SI-NEXT:    s_mov_b32 s1, s5
291; SI-NEXT:    v_mov_b32_e32 v0, s4
292; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
293; SI-NEXT:    s_endpgm
294;
295; VI-LABEL: s_trunc_i64_mul_to_i32:
296; VI:       ; %bb.0: ; %entry
297; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
298; VI-NEXT:    s_waitcnt lgkmcnt(0)
299; VI-NEXT:    s_load_dword s7, s[0:1], 0x34
300; VI-NEXT:    s_mov_b32 s3, 0xf000
301; VI-NEXT:    s_mov_b32 s2, -1
302; VI-NEXT:    s_mov_b32 s0, s4
303; VI-NEXT:    s_waitcnt lgkmcnt(0)
304; VI-NEXT:    s_mul_i32 s4, s7, s6
305; VI-NEXT:    s_mov_b32 s1, s5
306; VI-NEXT:    v_mov_b32_e32 v0, s4
307; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
308; VI-NEXT:    s_endpgm
309;
310; GFX9-LABEL: s_trunc_i64_mul_to_i32:
311; GFX9:       ; %bb.0: ; %entry
312; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x34
315; GFX9-NEXT:    ; kill: killed $sgpr0_sgpr1
316; GFX9-NEXT:    s_mov_b32 s3, 0xf000
317; GFX9-NEXT:    s_mov_b32 s2, -1
318; GFX9-NEXT:    s_mov_b32 s0, s4
319; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX9-NEXT:    s_mul_i32 s4, s7, s6
321; GFX9-NEXT:    s_mov_b32 s1, s5
322; GFX9-NEXT:    v_mov_b32_e32 v0, s4
323; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
324; GFX9-NEXT:    s_endpgm
325;
326; GFX10-LABEL: s_trunc_i64_mul_to_i32:
327; GFX10:       ; %bb.0: ; %entry
328; GFX10-NEXT:    s_clause 0x1
329; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
330; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x34
331; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
332; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
333; GFX10-NEXT:    s_mul_i32 s0, s2, s6
334; GFX10-NEXT:    s_mov_b32 s6, -1
335; GFX10-NEXT:    v_mov_b32_e32 v0, s0
336; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
337; GFX10-NEXT:    s_endpgm
338;
339; GFX11-LABEL: s_trunc_i64_mul_to_i32:
340; GFX11:       ; %bb.0: ; %entry
341; GFX11-NEXT:    s_clause 0x1
342; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
343; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
344; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
345; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
346; GFX11-NEXT:    s_mul_i32 s0, s0, s6
347; GFX11-NEXT:    s_mov_b32 s6, -1
348; GFX11-NEXT:    v_mov_b32_e32 v0, s0
349; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
350; GFX11-NEXT:    s_nop 0
351; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
352; GFX11-NEXT:    s_endpgm
353;
354; EG-LABEL: s_trunc_i64_mul_to_i32:
355; EG:       ; %bb.0: ; %entry
356; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
357; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
358; EG-NEXT:    CF_END
359; EG-NEXT:    PAD
360; EG-NEXT:    ALU clause starting at 4:
361; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
362; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
363; EG-NEXT:     MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
364entry:
; Full i64 product followed by a truncation: only the low 32 bits are stored.
365  %mul = mul i64 %b, %a
366  %trunc = trunc i64 %mul to i32
367  store i32 %trunc, ptr addrspace(1) %out, align 8
368  ret void
369}
370
; 64-bit multiply of two values loaded from memory, truncated to i32.
; The kernel loads an i64 from each of %aptr and %bptr, multiplies them,
; and stores only the low 32 bits of the product to %out. The expected
; output below loads a single 32-bit word from each pointer and performs
; one low 32-bit multiply (v_mul_lo_u32 / MULLO_INT).
371define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
372; SI-LABEL: v_trunc_i64_mul_to_i32:
373; SI:       ; %bb.0: ; %entry
374; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
375; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
376; SI-NEXT:    s_mov_b32 s3, 0xf000
377; SI-NEXT:    s_mov_b32 s2, -1
378; SI-NEXT:    s_mov_b32 s14, s2
379; SI-NEXT:    s_waitcnt lgkmcnt(0)
380; SI-NEXT:    s_mov_b32 s12, s6
381; SI-NEXT:    s_mov_b32 s13, s7
382; SI-NEXT:    s_mov_b32 s15, s3
383; SI-NEXT:    s_mov_b32 s10, s2
384; SI-NEXT:    s_mov_b32 s11, s3
385; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
386; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
387; SI-NEXT:    s_mov_b32 s0, s4
388; SI-NEXT:    s_mov_b32 s1, s5
389; SI-NEXT:    s_waitcnt vmcnt(0)
390; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
391; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
392; SI-NEXT:    s_endpgm
393;
394; VI-LABEL: v_trunc_i64_mul_to_i32:
395; VI:       ; %bb.0: ; %entry
396; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
397; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
398; VI-NEXT:    s_mov_b32 s3, 0xf000
399; VI-NEXT:    s_mov_b32 s2, -1
400; VI-NEXT:    s_mov_b32 s14, s2
401; VI-NEXT:    s_waitcnt lgkmcnt(0)
402; VI-NEXT:    s_mov_b32 s12, s6
403; VI-NEXT:    s_mov_b32 s13, s7
404; VI-NEXT:    s_mov_b32 s15, s3
405; VI-NEXT:    s_mov_b32 s10, s2
406; VI-NEXT:    s_mov_b32 s11, s3
407; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
408; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
409; VI-NEXT:    s_mov_b32 s0, s4
410; VI-NEXT:    s_mov_b32 s1, s5
411; VI-NEXT:    s_waitcnt vmcnt(0)
412; VI-NEXT:    v_mul_lo_u32 v0, v1, v0
413; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
414; VI-NEXT:    s_endpgm
415;
416; GFX9-LABEL: v_trunc_i64_mul_to_i32:
417; GFX9:       ; %bb.0: ; %entry
418; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
419; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
420; GFX9-NEXT:    s_mov_b32 s3, 0xf000
421; GFX9-NEXT:    s_mov_b32 s2, -1
422; GFX9-NEXT:    s_mov_b32 s14, s2
423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX9-NEXT:    s_mov_b32 s12, s6
425; GFX9-NEXT:    s_mov_b32 s13, s7
426; GFX9-NEXT:    s_mov_b32 s15, s3
427; GFX9-NEXT:    s_mov_b32 s10, s2
428; GFX9-NEXT:    s_mov_b32 s11, s3
429; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
430; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
431; GFX9-NEXT:    s_mov_b32 s0, s4
432; GFX9-NEXT:    s_mov_b32 s1, s5
433; GFX9-NEXT:    s_waitcnt vmcnt(0)
434; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
435; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
436; GFX9-NEXT:    s_endpgm
437;
438; GFX10-LABEL: v_trunc_i64_mul_to_i32:
439; GFX10:       ; %bb.0: ; %entry
440; GFX10-NEXT:    s_clause 0x1
441; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
442; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
443; GFX10-NEXT:    s_mov_b32 s2, -1
444; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
445; GFX10-NEXT:    s_mov_b32 s14, s2
446; GFX10-NEXT:    s_mov_b32 s15, s3
447; GFX10-NEXT:    s_mov_b32 s10, s2
448; GFX10-NEXT:    s_mov_b32 s11, s3
449; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX10-NEXT:    s_mov_b32 s12, s6
451; GFX10-NEXT:    s_mov_b32 s13, s7
452; GFX10-NEXT:    buffer_load_dword v0, off, s[12:15], 0
453; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
454; GFX10-NEXT:    s_mov_b32 s0, s4
455; GFX10-NEXT:    s_mov_b32 s1, s5
456; GFX10-NEXT:    s_waitcnt vmcnt(0)
457; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
458; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
459; GFX10-NEXT:    s_endpgm
460;
461; GFX11-LABEL: v_trunc_i64_mul_to_i32:
462; GFX11:       ; %bb.0: ; %entry
463; GFX11-NEXT:    s_clause 0x1
464; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
465; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
466; GFX11-NEXT:    s_mov_b32 s10, -1
467; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
468; GFX11-NEXT:    s_mov_b32 s14, s10
469; GFX11-NEXT:    s_mov_b32 s15, s11
470; GFX11-NEXT:    s_mov_b32 s2, s10
471; GFX11-NEXT:    s_mov_b32 s3, s11
472; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX11-NEXT:    s_mov_b32 s12, s6
474; GFX11-NEXT:    s_mov_b32 s13, s7
475; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
476; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
477; GFX11-NEXT:    s_mov_b32 s8, s4
478; GFX11-NEXT:    s_mov_b32 s9, s5
479; GFX11-NEXT:    s_waitcnt vmcnt(0)
480; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v0
481; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
482; GFX11-NEXT:    s_nop 0
483; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
484; GFX11-NEXT:    s_endpgm
485;
486; EG-LABEL: v_trunc_i64_mul_to_i32:
487; EG:       ; %bb.0: ; %entry
488; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
489; EG-NEXT:    TEX 1 @6
490; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
491; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
492; EG-NEXT:    CF_END
493; EG-NEXT:    PAD
494; EG-NEXT:    Fetch clause starting at 6:
495; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
496; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
497; EG-NEXT:    ALU clause starting at 10:
498; EG-NEXT:     MOV T0.X, KC0[2].Z,
499; EG-NEXT:     MOV * T1.X, KC0[2].W,
500; EG-NEXT:    ALU clause starting at 12:
501; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
502; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
503; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
504entry:
; Both operands are loaded as full i64 values in the IR; the truncation of
; the product means only the low 32 bits of the result are stored.
505  %a = load i64, ptr addrspace(1) %aptr, align 8
506  %b = load i64, ptr addrspace(1) %bptr, align 8
507  %mul = mul i64 %b, %a
508  %trunc = trunc i64 %mul to i32
509  store i32 %trunc, ptr addrspace(1) %out, align 8
510  ret void
511}
512
513; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
514; 32 bits of both arguments are sign bits.
515
; 64-bit multiply of a sign-extended i32 kernel argument by the constant 80.
; The i64 product is stored to %out. In the expected output below, most
; targets form the result from one signed high-half multiply plus one low
; multiply by 0x50 (80), while the tonga run uses a single v_mad_i64_i32
; with a zero addend instead.
516define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
517; SI-LABEL: mul64_sext_c:
518; SI:       ; %bb.0: ; %entry
519; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
520; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
521; SI-NEXT:    v_mov_b32_e32 v0, 0x50
522; SI-NEXT:    s_mov_b32 s3, 0xf000
523; SI-NEXT:    s_mov_b32 s2, -1
524; SI-NEXT:    s_waitcnt lgkmcnt(0)
525; SI-NEXT:    v_mul_hi_i32 v1, s4, v0
526; SI-NEXT:    s_mulk_i32 s4, 0x50
527; SI-NEXT:    v_mov_b32_e32 v0, s4
528; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
529; SI-NEXT:    s_endpgm
530;
531; VI-LABEL: mul64_sext_c:
532; VI:       ; %bb.0: ; %entry
533; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
534; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
535; VI-NEXT:    v_mov_b32_e32 v0, 0x50
536; VI-NEXT:    s_waitcnt lgkmcnt(0)
537; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
538; VI-NEXT:    s_mov_b32 s3, 0xf000
539; VI-NEXT:    s_mov_b32 s2, -1
540; VI-NEXT:    s_nop 2
541; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
542; VI-NEXT:    s_endpgm
543;
544; GFX9-LABEL: mul64_sext_c:
545; GFX9:       ; %bb.0: ; %entry
546; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
547; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
548; GFX9-NEXT:    s_mov_b32 s7, 0xf000
549; GFX9-NEXT:    s_mov_b32 s6, -1
550; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX9-NEXT:    s_mul_hi_i32 s0, s2, 0x50
552; GFX9-NEXT:    s_mulk_i32 s2, 0x50
553; GFX9-NEXT:    v_mov_b32_e32 v0, s2
554; GFX9-NEXT:    v_mov_b32_e32 v1, s0
555; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
556; GFX9-NEXT:    s_endpgm
557;
558; GFX10-LABEL: mul64_sext_c:
559; GFX10:       ; %bb.0: ; %entry
560; GFX10-NEXT:    s_clause 0x1
561; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
562; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
563; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
564; GFX10-NEXT:    s_mov_b32 s6, -1
565; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX10-NEXT:    s_mul_i32 s0, s2, 0x50
567; GFX10-NEXT:    s_mul_hi_i32 s1, s2, 0x50
568; GFX10-NEXT:    v_mov_b32_e32 v0, s0
569; GFX10-NEXT:    v_mov_b32_e32 v1, s1
570; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
571; GFX10-NEXT:    s_endpgm
572;
573; GFX11-LABEL: mul64_sext_c:
574; GFX11:       ; %bb.0: ; %entry
575; GFX11-NEXT:    s_clause 0x1
576; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
577; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
578; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
579; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
580; GFX11-NEXT:    s_mul_hi_i32 s2, s2, 0x50
581; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
582; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
583; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
584; GFX11-NEXT:    s_mov_b32 s2, -1
585; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
586; GFX11-NEXT:    s_nop 0
587; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
588; GFX11-NEXT:    s_endpgm
589;
590; EG-LABEL: mul64_sext_c:
591; EG:       ; %bb.0: ; %entry
592; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
593; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
594; EG-NEXT:    CF_END
595; EG-NEXT:    PAD
596; EG-NEXT:    ALU clause starting at 4:
597; EG-NEXT:     MULHI_INT * T0.Y, KC0[2].Z, literal.x,
598; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
599; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
600; EG-NEXT:     MULLO_INT * T0.X, KC0[2].Z, literal.y,
601; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
602entry:
; Sign-extend the 32-bit argument to 64 bits, then multiply by 80.
603  %0 = sext i32 %in to i64
604  %1 = mul i64 %0, 80
605  store i64 %1, ptr addrspace(1) %out
606  ret void
607}
608
; 64-bit multiply of a sign-extended i32 loaded from memory by the constant 80.
; The kernel loads an i32 from %in, sign-extends it to i64, multiplies by 80,
; and stores the i64 product to %out. In the expected output below, most
; targets use one signed high-half multiply plus one low multiply on the
; loaded value, while the tonga run uses a single v_mad_i64_i32 with a
; zero addend instead.
609define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
610; SI-LABEL: v_mul64_sext_c:
611; SI:       ; %bb.0: ; %entry
612; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
613; SI-NEXT:    s_mov_b32 s7, 0xf000
614; SI-NEXT:    s_mov_b32 s6, -1
615; SI-NEXT:    s_mov_b32 s10, s6
616; SI-NEXT:    s_mov_b32 s11, s7
617; SI-NEXT:    s_waitcnt lgkmcnt(0)
618; SI-NEXT:    s_mov_b32 s8, s2
619; SI-NEXT:    s_mov_b32 s9, s3
620; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
621; SI-NEXT:    s_movk_i32 s2, 0x50
622; SI-NEXT:    s_mov_b32 s4, s0
623; SI-NEXT:    s_mov_b32 s5, s1
624; SI-NEXT:    s_waitcnt vmcnt(0)
625; SI-NEXT:    v_mul_hi_i32 v1, v0, s2
626; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
627; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
628; SI-NEXT:    s_endpgm
629;
630; VI-LABEL: v_mul64_sext_c:
631; VI:       ; %bb.0: ; %entry
632; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
633; VI-NEXT:    s_mov_b32 s7, 0xf000
634; VI-NEXT:    s_mov_b32 s6, -1
635; VI-NEXT:    s_mov_b32 s10, s6
636; VI-NEXT:    s_mov_b32 s11, s7
637; VI-NEXT:    s_waitcnt lgkmcnt(0)
638; VI-NEXT:    s_mov_b32 s8, s2
639; VI-NEXT:    s_mov_b32 s9, s3
640; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
641; VI-NEXT:    s_movk_i32 s2, 0x50
642; VI-NEXT:    s_mov_b32 s4, s0
643; VI-NEXT:    s_mov_b32 s5, s1
644; VI-NEXT:    s_waitcnt vmcnt(0)
645; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
646; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
647; VI-NEXT:    s_endpgm
648;
649; GFX9-LABEL: v_mul64_sext_c:
650; GFX9:       ; %bb.0: ; %entry
651; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
652; GFX9-NEXT:    s_mov_b32 s7, 0xf000
653; GFX9-NEXT:    s_mov_b32 s6, -1
654; GFX9-NEXT:    s_mov_b32 s10, s6
655; GFX9-NEXT:    s_mov_b32 s11, s7
656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX9-NEXT:    s_mov_b32 s8, s2
658; GFX9-NEXT:    s_mov_b32 s9, s3
659; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
660; GFX9-NEXT:    s_movk_i32 s2, 0x50
661; GFX9-NEXT:    s_mov_b32 s4, s0
662; GFX9-NEXT:    s_mov_b32 s5, s1
663; GFX9-NEXT:    s_waitcnt vmcnt(0)
664; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
665; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
666; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
667; GFX9-NEXT:    s_endpgm
668;
669; GFX10-LABEL: v_mul64_sext_c:
670; GFX10:       ; %bb.0: ; %entry
671; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
672; GFX10-NEXT:    s_mov_b32 s6, -1
673; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
674; GFX10-NEXT:    s_mov_b32 s10, s6
675; GFX10-NEXT:    s_mov_b32 s11, s7
676; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-NEXT:    s_mov_b32 s8, s2
678; GFX10-NEXT:    s_mov_b32 s9, s3
679; GFX10-NEXT:    s_mov_b32 s4, s0
680; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
681; GFX10-NEXT:    s_mov_b32 s5, s1
682; GFX10-NEXT:    s_waitcnt vmcnt(0)
683; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
684; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
685; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
686; GFX10-NEXT:    s_endpgm
687;
688; GFX11-LABEL: v_mul64_sext_c:
689; GFX11:       ; %bb.0: ; %entry
690; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
691; GFX11-NEXT:    s_mov_b32 s6, -1
692; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
693; GFX11-NEXT:    s_mov_b32 s10, s6
694; GFX11-NEXT:    s_mov_b32 s11, s7
695; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX11-NEXT:    s_mov_b32 s8, s2
697; GFX11-NEXT:    s_mov_b32 s9, s3
698; GFX11-NEXT:    s_mov_b32 s4, s0
699; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
700; GFX11-NEXT:    s_mov_b32 s5, s1
701; GFX11-NEXT:    s_waitcnt vmcnt(0)
702; GFX11-NEXT:    v_mul_hi_i32 v1, 0x50, v0
703; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
704; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
705; GFX11-NEXT:    s_nop 0
706; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
707; GFX11-NEXT:    s_endpgm
708;
709; EG-LABEL: v_mul64_sext_c:
710; EG:       ; %bb.0: ; %entry
711; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
712; EG-NEXT:    TEX 0 @6
713; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
714; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
715; EG-NEXT:    CF_END
716; EG-NEXT:    PAD
717; EG-NEXT:    Fetch clause starting at 6:
718; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
719; EG-NEXT:    ALU clause starting at 8:
720; EG-NEXT:     MOV * T0.X, KC0[2].Z,
721; EG-NEXT:    ALU clause starting at 9:
722; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
723; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
724; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
725; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
726; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
727entry:
; Load a 32-bit value, sign-extend it to 64 bits, then multiply by 80.
728  %val = load i32, ptr addrspace(1) %in, align 4
729  %ext = sext i32 %val to i64
730  %mul = mul i64 %ext, 80
731  store i64 %mul, ptr addrspace(1) %out, align 8
732  ret void
733}
734
735define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
736; SI-LABEL: v_mul64_sext_inline_imm:
737; SI:       ; %bb.0: ; %entry
738; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
739; SI-NEXT:    s_mov_b32 s7, 0xf000
740; SI-NEXT:    s_mov_b32 s6, -1
741; SI-NEXT:    s_mov_b32 s10, s6
742; SI-NEXT:    s_mov_b32 s11, s7
743; SI-NEXT:    s_waitcnt lgkmcnt(0)
744; SI-NEXT:    s_mov_b32 s8, s2
745; SI-NEXT:    s_mov_b32 s9, s3
746; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
747; SI-NEXT:    s_mov_b32 s4, s0
748; SI-NEXT:    s_mov_b32 s5, s1
749; SI-NEXT:    s_waitcnt vmcnt(0)
750; SI-NEXT:    v_mul_hi_i32 v1, v0, 9
751; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
752; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
753; SI-NEXT:    s_endpgm
754;
755; VI-LABEL: v_mul64_sext_inline_imm:
756; VI:       ; %bb.0: ; %entry
757; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
758; VI-NEXT:    s_mov_b32 s7, 0xf000
759; VI-NEXT:    s_mov_b32 s6, -1
760; VI-NEXT:    s_mov_b32 s10, s6
761; VI-NEXT:    s_mov_b32 s11, s7
762; VI-NEXT:    s_waitcnt lgkmcnt(0)
763; VI-NEXT:    s_mov_b32 s8, s2
764; VI-NEXT:    s_mov_b32 s9, s3
765; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
766; VI-NEXT:    s_mov_b32 s4, s0
767; VI-NEXT:    s_mov_b32 s5, s1
768; VI-NEXT:    s_waitcnt vmcnt(0)
769; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
770; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
771; VI-NEXT:    s_endpgm
772;
773; GFX9-LABEL: v_mul64_sext_inline_imm:
774; GFX9:       ; %bb.0: ; %entry
775; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
776; GFX9-NEXT:    s_mov_b32 s7, 0xf000
777; GFX9-NEXT:    s_mov_b32 s6, -1
778; GFX9-NEXT:    s_mov_b32 s10, s6
779; GFX9-NEXT:    s_mov_b32 s11, s7
780; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
781; GFX9-NEXT:    s_mov_b32 s8, s2
782; GFX9-NEXT:    s_mov_b32 s9, s3
783; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
784; GFX9-NEXT:    s_mov_b32 s4, s0
785; GFX9-NEXT:    s_mov_b32 s5, s1
786; GFX9-NEXT:    s_waitcnt vmcnt(0)
787; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
788; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
789; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
790; GFX9-NEXT:    s_endpgm
791;
792; GFX10-LABEL: v_mul64_sext_inline_imm:
793; GFX10:       ; %bb.0: ; %entry
794; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
795; GFX10-NEXT:    s_mov_b32 s6, -1
796; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
797; GFX10-NEXT:    s_mov_b32 s10, s6
798; GFX10-NEXT:    s_mov_b32 s11, s7
799; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX10-NEXT:    s_mov_b32 s8, s2
801; GFX10-NEXT:    s_mov_b32 s9, s3
802; GFX10-NEXT:    s_mov_b32 s4, s0
803; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
804; GFX10-NEXT:    s_mov_b32 s5, s1
805; GFX10-NEXT:    s_waitcnt vmcnt(0)
806; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
807; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
808; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
809; GFX10-NEXT:    s_endpgm
810;
811; GFX11-LABEL: v_mul64_sext_inline_imm:
812; GFX11:       ; %bb.0: ; %entry
813; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
814; GFX11-NEXT:    s_mov_b32 s6, -1
815; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
816; GFX11-NEXT:    s_mov_b32 s10, s6
817; GFX11-NEXT:    s_mov_b32 s11, s7
818; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX11-NEXT:    s_mov_b32 s8, s2
820; GFX11-NEXT:    s_mov_b32 s9, s3
821; GFX11-NEXT:    s_mov_b32 s4, s0
822; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
823; GFX11-NEXT:    s_mov_b32 s5, s1
824; GFX11-NEXT:    s_waitcnt vmcnt(0)
825; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
826; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
827; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
828; GFX11-NEXT:    s_nop 0
829; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
830; GFX11-NEXT:    s_endpgm
831;
832; EG-LABEL: v_mul64_sext_inline_imm:
833; EG:       ; %bb.0: ; %entry
834; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
835; EG-NEXT:    TEX 0 @6
836; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
837; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
838; EG-NEXT:    CF_END
839; EG-NEXT:    PAD
840; EG-NEXT:    Fetch clause starting at 6:
841; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
842; EG-NEXT:    ALU clause starting at 8:
843; EG-NEXT:     MOV * T0.X, KC0[2].Z,
844; EG-NEXT:    ALU clause starting at 9:
845; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
846; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
847; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
848; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
849; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
850entry:
851  %val = load i32, ptr addrspace(1) %in, align 4
852  %ext = sext i32 %val to i64
853  %mul = mul i64 %ext, 9
854  store i64 %mul, ptr addrspace(1) %out, align 8
855  ret void
856}
857
; Scalar i32 multiply of two kernel arguments, stored to %out.  %a and %b are
; each preceded by an [8 x i32] padding argument, and the amdgcn checks below
; expect each operand to be fetched by its own scalar load and multiplied with
; s_mul_i32.  The r600 checks expect a single MULLO_INT that reads both
; operands directly as KC0 operands.
858define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
859; SI-LABEL: s_mul_i32:
860; SI:       ; %bb.0: ; %entry
861; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
862; SI-NEXT:    s_load_dword s5, s[0:1], 0x1c
863; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
864; SI-NEXT:    s_mov_b32 s3, 0xf000
865; SI-NEXT:    s_mov_b32 s2, -1
866; SI-NEXT:    s_waitcnt lgkmcnt(0)
867; SI-NEXT:    s_mul_i32 s4, s4, s5
868; SI-NEXT:    v_mov_b32_e32 v0, s4
869; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
870; SI-NEXT:    s_endpgm
871;
872; VI-LABEL: s_mul_i32:
873; VI:       ; %bb.0: ; %entry
874; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
875; VI-NEXT:    s_load_dword s5, s[0:1], 0x70
876; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
877; VI-NEXT:    s_mov_b32 s3, 0xf000
878; VI-NEXT:    s_mov_b32 s2, -1
879; VI-NEXT:    s_waitcnt lgkmcnt(0)
880; VI-NEXT:    s_mul_i32 s4, s4, s5
881; VI-NEXT:    v_mov_b32_e32 v0, s4
882; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
883; VI-NEXT:    s_endpgm
884;
885; GFX9-LABEL: s_mul_i32:
886; GFX9:       ; %bb.0: ; %entry
887; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x4c
888; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x70
889; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
890; GFX9-NEXT:    s_mov_b32 s7, 0xf000
891; GFX9-NEXT:    s_mov_b32 s6, -1
892; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX9-NEXT:    s_mul_i32 s0, s2, s3
894; GFX9-NEXT:    v_mov_b32_e32 v0, s0
895; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
896; GFX9-NEXT:    s_endpgm
897;
898; GFX10-LABEL: s_mul_i32:
899; GFX10:       ; %bb.0: ; %entry
900; GFX10-NEXT:    s_clause 0x2
901; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
902; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
903; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
904; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
905; GFX10-NEXT:    s_mov_b32 s6, -1
906; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX10-NEXT:    s_mul_i32 s0, s2, s3
908; GFX10-NEXT:    v_mov_b32_e32 v0, s0
909; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
910; GFX10-NEXT:    s_endpgm
911;
912; GFX11-LABEL: s_mul_i32:
913; GFX11:       ; %bb.0: ; %entry
914; GFX11-NEXT:    s_clause 0x2
915; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x4c
916; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x70
917; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
918; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
919; GFX11-NEXT:    s_mul_i32 s2, s2, s3
920; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
921; GFX11-NEXT:    v_mov_b32_e32 v0, s2
922; GFX11-NEXT:    s_mov_b32 s2, -1
923; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
924; GFX11-NEXT:    s_nop 0
925; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
926; GFX11-NEXT:    s_endpgm
927;
928; EG-LABEL: s_mul_i32:
929; EG:       ; %bb.0: ; %entry
930; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
931; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
932; EG-NEXT:    CF_END
933; EG-NEXT:    PAD
934; EG-NEXT:    ALU clause starting at 4:
935; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
936; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
937; EG-NEXT:     MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
938entry:
939  %mul = mul i32 %a, %b
940  store i32 %mul, ptr addrspace(1) %out, align 4
941  ret void
942}
943
; i32 multiply of two values loaded from global memory and stored to %out.
; %a is read from %in and %b from the i32 element that follows it.  The checks
; below expect the two loads to be combined into one 64-bit load that feeds a
; single v_mul_lo_u32 (amdgcn) or MULLO_INT (r600).
944define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
945; SI-LABEL: v_mul_i32:
946; SI:       ; %bb.0: ; %entry
947; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
948; SI-NEXT:    s_mov_b32 s7, 0xf000
949; SI-NEXT:    s_mov_b32 s6, -1
950; SI-NEXT:    s_mov_b32 s10, s6
951; SI-NEXT:    s_mov_b32 s11, s7
952; SI-NEXT:    s_waitcnt lgkmcnt(0)
953; SI-NEXT:    s_mov_b32 s8, s2
954; SI-NEXT:    s_mov_b32 s9, s3
955; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
956; SI-NEXT:    s_mov_b32 s4, s0
957; SI-NEXT:    s_mov_b32 s5, s1
958; SI-NEXT:    s_waitcnt vmcnt(0)
959; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
960; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
961; SI-NEXT:    s_endpgm
962;
963; VI-LABEL: v_mul_i32:
964; VI:       ; %bb.0: ; %entry
965; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
966; VI-NEXT:    s_mov_b32 s7, 0xf000
967; VI-NEXT:    s_mov_b32 s6, -1
968; VI-NEXT:    s_mov_b32 s10, s6
969; VI-NEXT:    s_mov_b32 s11, s7
970; VI-NEXT:    s_waitcnt lgkmcnt(0)
971; VI-NEXT:    s_mov_b32 s8, s2
972; VI-NEXT:    s_mov_b32 s9, s3
973; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
974; VI-NEXT:    s_mov_b32 s4, s0
975; VI-NEXT:    s_mov_b32 s5, s1
976; VI-NEXT:    s_waitcnt vmcnt(0)
977; VI-NEXT:    v_mul_lo_u32 v0, v0, v1
978; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
979; VI-NEXT:    s_endpgm
980;
981; GFX9-LABEL: v_mul_i32:
982; GFX9:       ; %bb.0: ; %entry
983; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
984; GFX9-NEXT:    s_mov_b32 s7, 0xf000
985; GFX9-NEXT:    s_mov_b32 s6, -1
986; GFX9-NEXT:    s_mov_b32 s10, s6
987; GFX9-NEXT:    s_mov_b32 s11, s7
988; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX9-NEXT:    s_mov_b32 s8, s2
990; GFX9-NEXT:    s_mov_b32 s9, s3
991; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
992; GFX9-NEXT:    s_mov_b32 s4, s0
993; GFX9-NEXT:    s_mov_b32 s5, s1
994; GFX9-NEXT:    s_waitcnt vmcnt(0)
995; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
996; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
997; GFX9-NEXT:    s_endpgm
998;
999; GFX10-LABEL: v_mul_i32:
1000; GFX10:       ; %bb.0: ; %entry
1001; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1002; GFX10-NEXT:    s_mov_b32 s6, -1
1003; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1004; GFX10-NEXT:    s_mov_b32 s10, s6
1005; GFX10-NEXT:    s_mov_b32 s11, s7
1006; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX10-NEXT:    s_mov_b32 s8, s2
1008; GFX10-NEXT:    s_mov_b32 s9, s3
1009; GFX10-NEXT:    s_mov_b32 s4, s0
1010; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1011; GFX10-NEXT:    s_mov_b32 s5, s1
1012; GFX10-NEXT:    s_waitcnt vmcnt(0)
1013; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
1014; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1015; GFX10-NEXT:    s_endpgm
1016;
1017; GFX11-LABEL: v_mul_i32:
1018; GFX11:       ; %bb.0: ; %entry
1019; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1020; GFX11-NEXT:    s_mov_b32 s6, -1
1021; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1022; GFX11-NEXT:    s_mov_b32 s10, s6
1023; GFX11-NEXT:    s_mov_b32 s11, s7
1024; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX11-NEXT:    s_mov_b32 s8, s2
1026; GFX11-NEXT:    s_mov_b32 s9, s3
1027; GFX11-NEXT:    s_mov_b32 s4, s0
1028; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
1029; GFX11-NEXT:    s_mov_b32 s5, s1
1030; GFX11-NEXT:    s_waitcnt vmcnt(0)
1031; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
1032; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
1033; GFX11-NEXT:    s_nop 0
1034; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1035; GFX11-NEXT:    s_endpgm
1036;
1037; EG-LABEL: v_mul_i32:
1038; EG:       ; %bb.0: ; %entry
1039; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1040; EG-NEXT:    TEX 0 @6
1041; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1042; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1043; EG-NEXT:    CF_END
1044; EG-NEXT:    PAD
1045; EG-NEXT:    Fetch clause starting at 6:
1046; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1047; EG-NEXT:    ALU clause starting at 8:
1048; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1049; EG-NEXT:    ALU clause starting at 9:
1050; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1051; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Y,
1052; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1053entry:
1054  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1055  %a = load i32, ptr addrspace(1) %in
1056  %b = load i32, ptr addrspace(1) %b_ptr
1057  %result = mul i32 %a, %b
1058  store i32 %result, ptr addrspace(1) %out
1059  ret void
1060}
1061
; i1 multiply of two kernel arguments, stored to %out as an i1.  %a and %b are
; each preceded by an [8 x i32] padding argument.  The checks below expect the
; product to be formed with a wider multiply (s_mul_i32, v_mul_lo_u16 or
; MULLO_INT, depending on the target), masked with 1, and then written with a
; byte store on amdgcn or a MEM_RAT MSKOR write on r600.
1062define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind {
1063; SI-LABEL: s_mul_i1:
1064; SI:       ; %bb.0: ; %entry
1065; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
1066; SI-NEXT:    s_load_dword s5, s[0:1], 0x1c
1067; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1068; SI-NEXT:    s_mov_b32 s3, 0xf000
1069; SI-NEXT:    s_mov_b32 s2, -1
1070; SI-NEXT:    s_waitcnt lgkmcnt(0)
1071; SI-NEXT:    s_mul_i32 s4, s4, s5
1072; SI-NEXT:    s_and_b32 s4, s4, 1
1073; SI-NEXT:    v_mov_b32_e32 v0, s4
1074; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1075; SI-NEXT:    s_endpgm
1076;
1077; VI-LABEL: s_mul_i1:
1078; VI:       ; %bb.0: ; %entry
1079; VI-NEXT:    s_load_dword s4, s[0:1], 0x70
1080; VI-NEXT:    s_load_dword s5, s[0:1], 0x4c
1081; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1082; VI-NEXT:    s_mov_b32 s3, 0xf000
1083; VI-NEXT:    s_mov_b32 s2, -1
1084; VI-NEXT:    s_waitcnt lgkmcnt(0)
1085; VI-NEXT:    v_mov_b32_e32 v0, s4
1086; VI-NEXT:    v_mul_lo_u16_e32 v0, s5, v0
1087; VI-NEXT:    v_and_b32_e32 v0, 1, v0
1088; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1089; VI-NEXT:    s_endpgm
1090;
1091; GFX9-LABEL: s_mul_i1:
1092; GFX9:       ; %bb.0: ; %entry
1093; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x70
1094; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x4c
1095; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1096; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1097; GFX9-NEXT:    s_mov_b32 s6, -1
1098; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1100; GFX9-NEXT:    v_mul_lo_u16_e32 v0, s3, v0
1101; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1102; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1103; GFX9-NEXT:    s_endpgm
1104;
1105; GFX10-LABEL: s_mul_i1:
1106; GFX10:       ; %bb.0: ; %entry
1107; GFX10-NEXT:    s_clause 0x2
1108; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
1109; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
1110; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1111; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1112; GFX10-NEXT:    s_mov_b32 s6, -1
1113; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX10-NEXT:    v_mul_lo_u16 v0, s2, s3
1115; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
1116; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1117; GFX10-NEXT:    s_endpgm
1118;
1119; GFX11-LABEL: s_mul_i1:
1120; GFX11:       ; %bb.0: ; %entry
1121; GFX11-NEXT:    s_clause 0x2
1122; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x4c
1123; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x70
1124; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1125; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1126; GFX11-NEXT:    v_mul_lo_u16 v0, s2, s3
1127; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1128; GFX11-NEXT:    s_mov_b32 s2, -1
1129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1130; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
1131; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
1132; GFX11-NEXT:    s_nop 0
1133; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1134; GFX11-NEXT:    s_endpgm
1135;
1136; EG-LABEL: s_mul_i1:
1137; EG:       ; %bb.0: ; %entry
1138; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
1139; EG-NEXT:    TEX 1 @6
1140; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
1141; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1142; EG-NEXT:    CF_END
1143; EG-NEXT:    PAD
1144; EG-NEXT:    Fetch clause starting at 6:
1145; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 72, #3
1146; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 108, #3
1147; EG-NEXT:    ALU clause starting at 10:
1148; EG-NEXT:     MOV * T0.X, 0.0,
1149; EG-NEXT:    ALU clause starting at 11:
1150; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
1151; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
1152; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1153; EG-NEXT:     AND_INT T1.W, PS, 1,
1154; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1155; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1156; EG-NEXT:     LSHL T0.X, PV.W, PS,
1157; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1158; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1159; EG-NEXT:     MOV T0.Y, 0.0,
1160; EG-NEXT:     MOV * T0.Z, 0.0,
1161; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1162; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1163entry:
1164  %mul = mul i1 %a, %b
1165  store i1 %mul, ptr addrspace(1) %out, align 4
1166  ret void
1167}
1168
; i1 multiply of two values loaded from global memory, stored to %out as an i1.
; %b_ptr is formed with an i32-typed getelementptr, so %b is read 4 bytes after
; %a; the checks below show byte loads at offsets 0 and 4.  The product is
; masked with 1 before it is written with a byte store on amdgcn or a
; MEM_RAT MSKOR write on r600.
1169define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1170; SI-LABEL: v_mul_i1:
1171; SI:       ; %bb.0: ; %entry
1172; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1173; SI-NEXT:    s_mov_b32 s7, 0xf000
1174; SI-NEXT:    s_mov_b32 s6, -1
1175; SI-NEXT:    s_mov_b32 s10, s6
1176; SI-NEXT:    s_mov_b32 s11, s7
1177; SI-NEXT:    s_waitcnt lgkmcnt(0)
1178; SI-NEXT:    s_mov_b32 s8, s2
1179; SI-NEXT:    s_mov_b32 s9, s3
1180; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1181; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1182; SI-NEXT:    s_mov_b32 s4, s0
1183; SI-NEXT:    s_mov_b32 s5, s1
1184; SI-NEXT:    s_waitcnt vmcnt(0)
1185; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
1186; SI-NEXT:    v_and_b32_e32 v0, 1, v0
1187; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1188; SI-NEXT:    s_endpgm
1189;
1190; VI-LABEL: v_mul_i1:
1191; VI:       ; %bb.0: ; %entry
1192; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1193; VI-NEXT:    s_mov_b32 s7, 0xf000
1194; VI-NEXT:    s_mov_b32 s6, -1
1195; VI-NEXT:    s_mov_b32 s10, s6
1196; VI-NEXT:    s_mov_b32 s11, s7
1197; VI-NEXT:    s_waitcnt lgkmcnt(0)
1198; VI-NEXT:    s_mov_b32 s8, s2
1199; VI-NEXT:    s_mov_b32 s9, s3
1200; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1201; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1202; VI-NEXT:    s_mov_b32 s4, s0
1203; VI-NEXT:    s_mov_b32 s5, s1
1204; VI-NEXT:    s_waitcnt vmcnt(0)
1205; VI-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
1206; VI-NEXT:    v_and_b32_e32 v0, 1, v0
1207; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1208; VI-NEXT:    s_endpgm
1209;
1210; GFX9-LABEL: v_mul_i1:
1211; GFX9:       ; %bb.0: ; %entry
1212; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1213; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1214; GFX9-NEXT:    s_mov_b32 s6, -1
1215; GFX9-NEXT:    s_mov_b32 s10, s6
1216; GFX9-NEXT:    s_mov_b32 s11, s7
1217; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1218; GFX9-NEXT:    s_mov_b32 s8, s2
1219; GFX9-NEXT:    s_mov_b32 s9, s3
1220; GFX9-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1221; GFX9-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1222; GFX9-NEXT:    s_mov_b32 s4, s0
1223; GFX9-NEXT:    s_mov_b32 s5, s1
1224; GFX9-NEXT:    s_waitcnt vmcnt(0)
1225; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
1226; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1227; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1228; GFX9-NEXT:    s_endpgm
1229;
1230; GFX10-LABEL: v_mul_i1:
1231; GFX10:       ; %bb.0: ; %entry
1232; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1233; GFX10-NEXT:    s_mov_b32 s6, -1
1234; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1235; GFX10-NEXT:    s_mov_b32 s10, s6
1236; GFX10-NEXT:    s_mov_b32 s11, s7
1237; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX10-NEXT:    s_mov_b32 s8, s2
1239; GFX10-NEXT:    s_mov_b32 s9, s3
1240; GFX10-NEXT:    s_clause 0x1
1241; GFX10-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1242; GFX10-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:4
1243; GFX10-NEXT:    s_mov_b32 s4, s0
1244; GFX10-NEXT:    s_mov_b32 s5, s1
1245; GFX10-NEXT:    s_waitcnt vmcnt(0)
1246; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
1247; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
1248; GFX10-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1249; GFX10-NEXT:    s_endpgm
1250;
1251; GFX11-LABEL: v_mul_i1:
1252; GFX11:       ; %bb.0: ; %entry
1253; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1254; GFX11-NEXT:    s_mov_b32 s6, -1
1255; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1256; GFX11-NEXT:    s_mov_b32 s10, s6
1257; GFX11-NEXT:    s_mov_b32 s11, s7
1258; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX11-NEXT:    s_mov_b32 s8, s2
1260; GFX11-NEXT:    s_mov_b32 s9, s3
1261; GFX11-NEXT:    s_clause 0x1
1262; GFX11-NEXT:    buffer_load_u8 v0, off, s[8:11], 0
1263; GFX11-NEXT:    buffer_load_u8 v1, off, s[8:11], 0 offset:4
1264; GFX11-NEXT:    s_mov_b32 s4, s0
1265; GFX11-NEXT:    s_mov_b32 s5, s1
1266; GFX11-NEXT:    s_waitcnt vmcnt(0)
1267; GFX11-NEXT:    v_mul_lo_u16 v0, v0, v1
1268; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1269; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
1270; GFX11-NEXT:    buffer_store_b8 v0, off, s[4:7], 0
1271; GFX11-NEXT:    s_nop 0
1272; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1273; GFX11-NEXT:    s_endpgm
1274;
1275; EG-LABEL: v_mul_i1:
1276; EG:       ; %bb.0: ; %entry
1277; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1278; EG-NEXT:    TEX 1 @6
1279; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
1280; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1281; EG-NEXT:    CF_END
1282; EG-NEXT:    PAD
1283; EG-NEXT:    Fetch clause starting at 6:
1284; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 4, #1
1285; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1286; EG-NEXT:    ALU clause starting at 10:
1287; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1288; EG-NEXT:    ALU clause starting at 11:
1289; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
1290; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
1291; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1292; EG-NEXT:     AND_INT T1.W, PS, 1,
1293; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1294; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1295; EG-NEXT:     LSHL T0.X, PV.W, PS,
1296; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1297; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1298; EG-NEXT:     MOV T0.Y, 0.0,
1299; EG-NEXT:     MOV * T0.Z, 0.0,
1300; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1301; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1302entry:
1303  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
1304  %a = load i1, ptr addrspace(1) %in
1305  %b = load i1, ptr addrspace(1) %b_ptr
1306  %result = mul i1 %a, %b
1307  store i1 %result, ptr addrspace(1) %out
1308  ret void
1309}
1310
1311; A standard 64-bit multiply.  The expansion should be around 6 instructions.
1312; Matching that expansion by hand would need a really complicated list of
1313; FileCheck expressions, so this test originally checked only the function
1314; label to make sure the compiler does not crash with a 'failed to select'
1315; error.  The checks below are now autogenerated (see the NOTE at the top of
1316; this file); rerun the update script after a correct optimization changes them.
1317
; i64 multiply of two scalar kernel arguments, stored to %out.  The checks
; below pin the per-target expansion into 32-bit operations: the full product
; of the two low halves, plus the two low-by-high cross products added into the
; high half of the result.
1318define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
1319; SI-LABEL: s_mul_i64:
1320; SI:       ; %bb.0: ; %entry
1321; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1322; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1323; SI-NEXT:    s_mov_b32 s3, 0xf000
1324; SI-NEXT:    s_mov_b32 s2, -1
1325; SI-NEXT:    s_waitcnt lgkmcnt(0)
1326; SI-NEXT:    s_mov_b32 s0, s4
1327; SI-NEXT:    v_mov_b32_e32 v0, s8
1328; SI-NEXT:    v_mul_hi_u32 v0, s6, v0
1329; SI-NEXT:    s_mul_i32 s4, s6, s9
1330; SI-NEXT:    s_mov_b32 s1, s5
1331; SI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
1332; SI-NEXT:    s_mul_i32 s4, s7, s8
1333; SI-NEXT:    v_add_i32_e32 v1, vcc, s4, v0
1334; SI-NEXT:    s_mul_i32 s4, s6, s8
1335; SI-NEXT:    v_mov_b32_e32 v0, s4
1336; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1337; SI-NEXT:    s_endpgm
1338;
1339; VI-LABEL: s_mul_i64:
1340; VI:       ; %bb.0: ; %entry
1341; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1342; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1343; VI-NEXT:    s_mov_b32 s3, 0xf000
1344; VI-NEXT:    s_mov_b32 s2, -1
1345; VI-NEXT:    s_waitcnt lgkmcnt(0)
1346; VI-NEXT:    s_mov_b32 s0, s4
1347; VI-NEXT:    v_mov_b32_e32 v0, s8
1348; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0
1349; VI-NEXT:    s_mul_i32 s4, s6, s9
1350; VI-NEXT:    s_mov_b32 s1, s5
1351; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1352; VI-NEXT:    s_mul_i32 s4, s7, s8
1353; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1354; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1355; VI-NEXT:    s_endpgm
1356;
1357; GFX9-LABEL: s_mul_i64:
1358; GFX9:       ; %bb.0: ; %entry
1359; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1360; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1361; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1362; GFX9-NEXT:    s_mov_b32 s2, -1
1363; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX9-NEXT:    s_mov_b32 s0, s4
1365; GFX9-NEXT:    s_mov_b32 s1, s5
1366; GFX9-NEXT:    s_mul_i32 s4, s6, s9
1367; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s8
1368; GFX9-NEXT:    s_add_i32 s4, s5, s4
1369; GFX9-NEXT:    s_mul_i32 s5, s7, s8
1370; GFX9-NEXT:    s_add_i32 s4, s4, s5
1371; GFX9-NEXT:    s_mul_i32 s5, s6, s8
1372; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1373; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1374; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1375; GFX9-NEXT:    s_endpgm
1376;
1377; GFX10-LABEL: s_mul_i64:
1378; GFX10:       ; %bb.0: ; %entry
1379; GFX10-NEXT:    s_clause 0x1
1380; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1381; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1382; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX10-NEXT:    s_mul_i32 s0, s6, s3
1384; GFX10-NEXT:    s_mul_hi_u32 s1, s6, s2
1385; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1386; GFX10-NEXT:    s_add_i32 s0, s1, s0
1387; GFX10-NEXT:    s_mul_i32 s1, s7, s2
1388; GFX10-NEXT:    s_mul_i32 s2, s6, s2
1389; GFX10-NEXT:    s_add_i32 s0, s0, s1
1390; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1391; GFX10-NEXT:    v_mov_b32_e32 v1, s0
1392; GFX10-NEXT:    s_mov_b32 s2, -1
1393; GFX10-NEXT:    s_mov_b32 s0, s4
1394; GFX10-NEXT:    s_mov_b32 s1, s5
1395; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1396; GFX10-NEXT:    s_endpgm
1397;
1398; GFX11-LABEL: s_mul_i64:
1399; GFX11:       ; %bb.0: ; %entry
1400; GFX11-NEXT:    s_clause 0x1
1401; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1402; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1403; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1404; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX11-NEXT:    s_mul_i32 s1, s6, s1
1406; GFX11-NEXT:    s_mul_hi_u32 s2, s6, s0
1407; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1408; GFX11-NEXT:    s_add_i32 s1, s2, s1
1409; GFX11-NEXT:    s_mul_i32 s2, s7, s0
1410; GFX11-NEXT:    s_mul_i32 s0, s6, s0
1411; GFX11-NEXT:    s_add_i32 s1, s1, s2
1412; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1413; GFX11-NEXT:    s_mov_b32 s2, -1
1414; GFX11-NEXT:    s_mov_b32 s0, s4
1415; GFX11-NEXT:    s_mov_b32 s1, s5
1416; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1417; GFX11-NEXT:    s_nop 0
1418; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1419; GFX11-NEXT:    s_endpgm
1420;
1421; EG-LABEL: s_mul_i64:
1422; EG:       ; %bb.0: ; %entry
1423; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1424; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1425; EG-NEXT:    CF_END
1426; EG-NEXT:    PAD
1427; EG-NEXT:    ALU clause starting at 4:
1428; EG-NEXT:     MULHI * T0.X, KC0[2].W, KC0[3].Y,
1429; EG-NEXT:     MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z,
1430; EG-NEXT:     ADD_INT T0.W, T0.X, PS,
1431; EG-NEXT:     MULLO_INT * T0.X, KC0[3].X, KC0[3].Y,
1432; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
1433; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1434; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1435; EG-NEXT:     MULLO_INT * T0.X, KC0[2].W, KC0[3].Y,
1436entry:
1437  %mul = mul i64 %a, %b
1438  store i64 %mul, ptr addrspace(1) %out, align 8
1439  ret void
1440}
1441
; i64 multiply of two values loaded from global memory (%aptr and %bptr),
; stored to %out.  The checks below pin the per-target expansion into 32-bit
; vector operations: the full product of the two low halves, plus the two
; low-by-high cross products added into the high half of the result.
1442define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
1443; SI-LABEL: v_mul_i64:
1444; SI:       ; %bb.0: ; %entry
1445; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1446; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1447; SI-NEXT:    s_mov_b32 s3, 0xf000
1448; SI-NEXT:    s_mov_b32 s2, -1
1449; SI-NEXT:    s_mov_b32 s10, s2
1450; SI-NEXT:    s_mov_b32 s11, s3
1451; SI-NEXT:    s_waitcnt lgkmcnt(0)
1452; SI-NEXT:    s_mov_b32 s12, s6
1453; SI-NEXT:    s_mov_b32 s13, s7
1454; SI-NEXT:    s_mov_b32 s14, s2
1455; SI-NEXT:    s_mov_b32 s15, s3
1456; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1457; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1458; SI-NEXT:    s_mov_b32 s0, s4
1459; SI-NEXT:    s_mov_b32 s1, s5
1460; SI-NEXT:    s_waitcnt vmcnt(0)
1461; SI-NEXT:    v_mul_lo_u32 v1, v2, v1
1462; SI-NEXT:    v_mul_hi_u32 v4, v2, v0
1463; SI-NEXT:    v_mul_lo_u32 v3, v3, v0
1464; SI-NEXT:    v_mul_lo_u32 v0, v2, v0
1465; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1466; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1467; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1468; SI-NEXT:    s_endpgm
1469;
1470; VI-LABEL: v_mul_i64:
1471; VI:       ; %bb.0: ; %entry
1472; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1473; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1474; VI-NEXT:    s_mov_b32 s3, 0xf000
1475; VI-NEXT:    s_mov_b32 s2, -1
1476; VI-NEXT:    s_mov_b32 s10, s2
1477; VI-NEXT:    s_mov_b32 s11, s3
1478; VI-NEXT:    s_waitcnt lgkmcnt(0)
1479; VI-NEXT:    s_mov_b32 s12, s6
1480; VI-NEXT:    s_mov_b32 s13, s7
1481; VI-NEXT:    s_mov_b32 s14, s2
1482; VI-NEXT:    s_mov_b32 s15, s3
1483; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1484; VI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1485; VI-NEXT:    s_mov_b32 s0, s4
1486; VI-NEXT:    s_mov_b32 s1, s5
1487; VI-NEXT:    s_waitcnt vmcnt(0)
1488; VI-NEXT:    v_mul_lo_u32 v4, v2, v1
1489; VI-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0
1490; VI-NEXT:    v_mul_lo_u32 v0, v3, v0
1491; VI-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
1492; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
1493; VI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
1494; VI-NEXT:    s_endpgm
1495;
1496; GFX9-LABEL: v_mul_i64:
1497; GFX9:       ; %bb.0: ; %entry
1498; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1499; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1500; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1501; GFX9-NEXT:    s_mov_b32 s2, -1
1502; GFX9-NEXT:    s_mov_b32 s10, s2
1503; GFX9-NEXT:    s_mov_b32 s11, s3
1504; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1505; GFX9-NEXT:    s_mov_b32 s12, s6
1506; GFX9-NEXT:    s_mov_b32 s13, s7
1507; GFX9-NEXT:    s_mov_b32 s14, s2
1508; GFX9-NEXT:    s_mov_b32 s15, s3
1509; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1510; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1511; GFX9-NEXT:    s_mov_b32 s0, s4
1512; GFX9-NEXT:    s_mov_b32 s1, s5
1513; GFX9-NEXT:    s_waitcnt vmcnt(0)
1514; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
1515; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v0
1516; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
1517; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
1518; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
1519; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1520; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1521; GFX9-NEXT:    s_endpgm
1522;
1523; GFX10-LABEL: v_mul_i64:
1524; GFX10:       ; %bb.0: ; %entry
1525; GFX10-NEXT:    s_clause 0x1
1526; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1527; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1528; GFX10-NEXT:    s_mov_b32 s2, -1
1529; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1530; GFX10-NEXT:    s_mov_b32 s10, s2
1531; GFX10-NEXT:    s_mov_b32 s11, s3
1532; GFX10-NEXT:    s_mov_b32 s14, s2
1533; GFX10-NEXT:    s_mov_b32 s15, s3
1534; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1535; GFX10-NEXT:    s_mov_b32 s12, s6
1536; GFX10-NEXT:    s_mov_b32 s13, s7
1537; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1538; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1539; GFX10-NEXT:    s_mov_b32 s0, s4
1540; GFX10-NEXT:    s_mov_b32 s1, s5
1541; GFX10-NEXT:    s_waitcnt vmcnt(0)
1542; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
1543; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v0
1544; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
1545; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
1546; GFX10-NEXT:    v_add_nc_u32_e32 v1, v4, v1
1547; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
1548; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1549; GFX10-NEXT:    s_endpgm
1550;
1551; GFX11-LABEL: v_mul_i64:
1552; GFX11:       ; %bb.0: ; %entry
1553; GFX11-NEXT:    s_clause 0x1
1554; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1555; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1556; GFX11-NEXT:    s_mov_b32 s10, -1
1557; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1558; GFX11-NEXT:    s_mov_b32 s2, s10
1559; GFX11-NEXT:    s_mov_b32 s3, s11
1560; GFX11-NEXT:    s_mov_b32 s14, s10
1561; GFX11-NEXT:    s_mov_b32 s15, s11
1562; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1563; GFX11-NEXT:    s_mov_b32 s12, s6
1564; GFX11-NEXT:    s_mov_b32 s13, s7
1565; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
1566; GFX11-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], 0
1567; GFX11-NEXT:    s_mov_b32 s8, s4
1568; GFX11-NEXT:    s_mov_b32 s9, s5
1569; GFX11-NEXT:    s_waitcnt vmcnt(0)
1570; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v1
1571; GFX11-NEXT:    v_mul_hi_u32 v4, v2, v0
1572; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v0
1573; GFX11-NEXT:    v_mul_lo_u32 v0, v2, v0
1574; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1575; GFX11-NEXT:    v_add_nc_u32_e32 v1, v4, v1
1576; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v3
1577; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
1578; GFX11-NEXT:    s_nop 0
1579; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1580; GFX11-NEXT:    s_endpgm
1581;
1582; EG-LABEL: v_mul_i64:
1583; EG:       ; %bb.0: ; %entry
1584; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
1585; EG-NEXT:    TEX 1 @6
1586; EG-NEXT:    ALU 7, @12, KC0[CB0:0-32], KC1[]
1587; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
1588; EG-NEXT:    CF_END
1589; EG-NEXT:    PAD
1590; EG-NEXT:    Fetch clause starting at 6:
1591; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
1592; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1593; EG-NEXT:    ALU clause starting at 10:
1594; EG-NEXT:     MOV T0.X, KC0[2].Z,
1595; EG-NEXT:     MOV * T1.X, KC0[2].W,
1596; EG-NEXT:    ALU clause starting at 12:
1597; EG-NEXT:     MULHI * T0.Z, T0.X, T1.X,
1598; EG-NEXT:     MULLO_INT * T0.W, T0.X, T1.Y,
1599; EG-NEXT:     ADD_INT T0.W, T0.Z, PS,
1600; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.X,
1601; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
1602; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
1603; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
1604; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1605entry:
1606  %a = load i64, ptr addrspace(1) %aptr, align 8
1607  %b = load i64, ptr addrspace(1) %bptr, align 8
1608  %mul = mul i64 %a, %b
1609  store i64 %mul, ptr addrspace(1) %out, align 8
1610  ret void
1611}
1612
; mul32_in_branch: an i32 multiply that sits on only one side of a branch.
; When %a is zero the kernel loads an i32 from %in; otherwise it computes
; %a * %b. Either value reaches %out through a phi. %c is declared but unused.
; The amdgcn runs expect the multiply as a scalar s_mul_i32, and the redwood
; run expects MULLO_INT. The assertions below are autogenerated (see the NOTE
; on the first line of the file); regenerate them instead of editing by hand.
1613define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
1614; SI-LABEL: mul32_in_branch:
1615; SI:       ; %bb.0: ; %entry
1616; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
1617; SI-NEXT:    s_waitcnt lgkmcnt(0)
1618; SI-NEXT:    s_cmp_lg_u32 s2, 0
1619; SI-NEXT:    s_cbranch_scc0 .LBB13_2
1620; SI-NEXT:  ; %bb.1: ; %else
1621; SI-NEXT:    s_mul_i32 s6, s2, s3
1622; SI-NEXT:    s_mov_b64 s[4:5], 0
1623; SI-NEXT:    s_branch .LBB13_3
1624; SI-NEXT:  .LBB13_2:
1625; SI-NEXT:    s_mov_b64 s[4:5], -1
1626; SI-NEXT:    ; implicit-def: $sgpr6
1627; SI-NEXT:  .LBB13_3: ; %Flow
1628; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1629; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1630; SI-NEXT:    s_waitcnt lgkmcnt(0)
1631; SI-NEXT:    s_mov_b64 vcc, vcc
1632; SI-NEXT:    s_cbranch_vccnz .LBB13_5
1633; SI-NEXT:  ; %bb.4: ; %if
1634; SI-NEXT:    s_mov_b32 s7, 0xf000
1635; SI-NEXT:    s_mov_b32 s6, -1
1636; SI-NEXT:    s_mov_b32 s4, s2
1637; SI-NEXT:    s_mov_b32 s5, s3
1638; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1639; SI-NEXT:    s_branch .LBB13_6
1640; SI-NEXT:  .LBB13_5:
1641; SI-NEXT:    v_mov_b32_e32 v0, s6
1642; SI-NEXT:  .LBB13_6: ; %endif
1643; SI-NEXT:    s_mov_b32 s3, 0xf000
1644; SI-NEXT:    s_mov_b32 s2, -1
1645; SI-NEXT:    s_waitcnt vmcnt(0)
1646; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1647; SI-NEXT:    s_endpgm
1648;
1649; VI-LABEL: mul32_in_branch:
1650; VI:       ; %bb.0: ; %entry
1651; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1652; VI-NEXT:    s_waitcnt lgkmcnt(0)
1653; VI-NEXT:    s_cmp_lg_u32 s2, 0
1654; VI-NEXT:    s_cbranch_scc0 .LBB13_2
1655; VI-NEXT:  ; %bb.1: ; %else
1656; VI-NEXT:    s_mul_i32 s6, s2, s3
1657; VI-NEXT:    s_mov_b64 s[4:5], 0
1658; VI-NEXT:    s_branch .LBB13_3
1659; VI-NEXT:  .LBB13_2:
1660; VI-NEXT:    s_mov_b64 s[4:5], -1
1661; VI-NEXT:    ; implicit-def: $sgpr6
1662; VI-NEXT:  .LBB13_3: ; %Flow
1663; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1664; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1665; VI-NEXT:    s_cbranch_vccnz .LBB13_5
1666; VI-NEXT:  ; %bb.4: ; %if
1667; VI-NEXT:    s_mov_b32 s7, 0xf000
1668; VI-NEXT:    s_mov_b32 s6, -1
1669; VI-NEXT:    s_waitcnt lgkmcnt(0)
1670; VI-NEXT:    s_mov_b32 s4, s2
1671; VI-NEXT:    s_mov_b32 s5, s3
1672; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1673; VI-NEXT:    s_branch .LBB13_6
1674; VI-NEXT:  .LBB13_5:
1675; VI-NEXT:    v_mov_b32_e32 v0, s6
1676; VI-NEXT:  .LBB13_6: ; %endif
1677; VI-NEXT:    s_waitcnt lgkmcnt(0)
1678; VI-NEXT:    s_mov_b32 s3, 0xf000
1679; VI-NEXT:    s_mov_b32 s2, -1
1680; VI-NEXT:    s_waitcnt vmcnt(0)
1681; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1682; VI-NEXT:    s_endpgm
1683;
1684; GFX9-LABEL: mul32_in_branch:
1685; GFX9:       ; %bb.0: ; %entry
1686; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1687; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1688; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
1689; GFX9-NEXT:    s_cbranch_scc0 .LBB13_2
1690; GFX9-NEXT:  ; %bb.1: ; %else
1691; GFX9-NEXT:    s_mul_i32 s6, s2, s3
1692; GFX9-NEXT:    s_mov_b64 s[4:5], 0
1693; GFX9-NEXT:    s_branch .LBB13_3
1694; GFX9-NEXT:  .LBB13_2:
1695; GFX9-NEXT:    s_mov_b64 s[4:5], -1
1696; GFX9-NEXT:    ; implicit-def: $sgpr6
1697; GFX9-NEXT:  .LBB13_3: ; %Flow
1698; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1699; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1700; GFX9-NEXT:    s_cbranch_vccnz .LBB13_5
1701; GFX9-NEXT:  ; %bb.4: ; %if
1702; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1703; GFX9-NEXT:    s_mov_b32 s6, -1
1704; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1705; GFX9-NEXT:    s_mov_b32 s4, s2
1706; GFX9-NEXT:    s_mov_b32 s5, s3
1707; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1708; GFX9-NEXT:    s_branch .LBB13_6
1709; GFX9-NEXT:  .LBB13_5:
1710; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1711; GFX9-NEXT:  .LBB13_6: ; %endif
1712; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1713; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1714; GFX9-NEXT:    s_mov_b32 s2, -1
1715; GFX9-NEXT:    s_waitcnt vmcnt(0)
1716; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1717; GFX9-NEXT:    s_endpgm
1718;
1719; GFX10-LABEL: mul32_in_branch:
1720; GFX10:       ; %bb.0: ; %entry
1721; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1722; GFX10-NEXT:    s_mov_b32 s4, 0
1723; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1724; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
1725; GFX10-NEXT:    s_cbranch_scc0 .LBB13_2
1726; GFX10-NEXT:  ; %bb.1: ; %else
1727; GFX10-NEXT:    s_mul_i32 s5, s2, s3
1728; GFX10-NEXT:    s_branch .LBB13_3
1729; GFX10-NEXT:  .LBB13_2:
1730; GFX10-NEXT:    s_mov_b32 s4, -1
1731; GFX10-NEXT:    ; implicit-def: $sgpr5
1732; GFX10-NEXT:  .LBB13_3: ; %Flow
1733; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1734; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
1735; GFX10-NEXT:    s_cbranch_vccnz .LBB13_5
1736; GFX10-NEXT:  ; %bb.4: ; %if
1737; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1738; GFX10-NEXT:    s_mov_b32 s6, -1
1739; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX10-NEXT:    s_mov_b32 s4, s2
1741; GFX10-NEXT:    s_mov_b32 s5, s3
1742; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1743; GFX10-NEXT:    s_branch .LBB13_6
1744; GFX10-NEXT:  .LBB13_5:
1745; GFX10-NEXT:    v_mov_b32_e32 v0, s5
1746; GFX10-NEXT:  .LBB13_6: ; %endif
1747; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1748; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1749; GFX10-NEXT:    s_mov_b32 s2, -1
1750; GFX10-NEXT:    s_waitcnt vmcnt(0)
1751; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1752; GFX10-NEXT:    s_endpgm
1753;
1754; GFX11-LABEL: mul32_in_branch:
1755; GFX11:       ; %bb.0: ; %entry
1756; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
1757; GFX11-NEXT:    s_mov_b32 s4, 0
1758; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1759; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
1760; GFX11-NEXT:    s_cbranch_scc0 .LBB13_2
1761; GFX11-NEXT:  ; %bb.1: ; %else
1762; GFX11-NEXT:    s_mul_i32 s5, s2, s3
1763; GFX11-NEXT:    s_branch .LBB13_3
1764; GFX11-NEXT:  .LBB13_2:
1765; GFX11-NEXT:    s_mov_b32 s4, -1
1766; GFX11-NEXT:    ; implicit-def: $sgpr5
1767; GFX11-NEXT:  .LBB13_3: ; %Flow
1768; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1769; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
1770; GFX11-NEXT:    s_cbranch_vccnz .LBB13_5
1771; GFX11-NEXT:  ; %bb.4: ; %if
1772; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1773; GFX11-NEXT:    s_mov_b32 s6, -1
1774; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1775; GFX11-NEXT:    s_mov_b32 s4, s2
1776; GFX11-NEXT:    s_mov_b32 s5, s3
1777; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
1778; GFX11-NEXT:    s_branch .LBB13_6
1779; GFX11-NEXT:  .LBB13_5:
1780; GFX11-NEXT:    v_mov_b32_e32 v0, s5
1781; GFX11-NEXT:  .LBB13_6: ; %endif
1782; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1783; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1784; GFX11-NEXT:    s_mov_b32 s2, -1
1785; GFX11-NEXT:    s_waitcnt vmcnt(0)
1786; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1787; GFX11-NEXT:    s_nop 0
1788; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1789; GFX11-NEXT:    s_endpgm
1790;
1791; EG-LABEL: mul32_in_branch:
1792; EG:       ; %bb.0: ; %entry
1793; EG-NEXT:    ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
1794; EG-NEXT:    JUMP @3 POP:1
1795; EG-NEXT:    ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[]
1796; EG-NEXT:    ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[]
1797; EG-NEXT:    JUMP @8 POP:1
1798; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
1799; EG-NEXT:    TEX 0 @12
1800; EG-NEXT:    POP @8 POP:1
1801; EG-NEXT:    ALU 1, @27, KC0[], KC1[]
1802; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1803; EG-NEXT:    CF_END
1804; EG-NEXT:    PAD
1805; EG-NEXT:    Fetch clause starting at 12:
1806; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1807; EG-NEXT:    ALU clause starting at 14:
1808; EG-NEXT:     MOV T0.W, literal.x,
1809; EG-NEXT:     SETNE_INT * T1.W, KC0[2].W, 0.0,
1810; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1811; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1812; EG-NEXT:    ALU clause starting at 18:
1813; EG-NEXT:     MOV T1.W, KC0[2].W,
1814; EG-NEXT:     MOV * T2.W, KC0[3].X,
1815; EG-NEXT:     MOV T0.W, literal.x,
1816; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
1817; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1818; EG-NEXT:    ALU clause starting at 23:
1819; EG-NEXT:     MOV T1.W, KC0[2].Y,
1820; EG-NEXT:     SETE_INT * T0.W, T0.W, 0.0,
1821; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1822; EG-NEXT:    ALU clause starting at 26:
1823; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1824; EG-NEXT:    ALU clause starting at 27:
1825; EG-NEXT:     LSHR * T1.X, T1.W, literal.x,
1826; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1827entry:
  ; Take the %if path when %a is zero, otherwise the %else path.
1828  %0 = icmp eq i32 %a, 0
1829  br i1 %0, label %if, label %else
1830
1831if:
  ; No multiply on this path: the result is read from memory.
1832  %1 = load i32, ptr addrspace(1) %in
1833  br label %endif
1834
1835else:
  ; The multiply under test; both operands are kernel arguments.
1836  %2 = mul i32 %a, %b
1837  br label %endif
1838
1839endif:
  ; Merge the loaded value and the product, then store the result to %out.
1840  %3 = phi i32 [%1, %if], [%2, %else]
1841  store i32 %3, ptr addrspace(1) %out
1842  ret void
1843}
1844
; mul64_in_branch: the i64 counterpart of mul32_in_branch.
; When %a is zero the kernel loads an i64 from %in; otherwise it computes
; %a * %b. Either value reaches %out through a phi. %c is declared but unused.
; Expected lowering of the multiply differs per run line: verde mixes
; v_mul_hi_u32 with s_mul_i32, tonga uses v_mad_u64_u32, and gfx900, gfx1010
; and gfx1100 use a fully scalar s_mul_i32 / s_mul_hi_u32 / s_add_i32 sequence.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
1845define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
1846; SI-LABEL: mul64_in_branch:
1847; SI:       ; %bb.0: ; %entry
1848; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
1849; SI-NEXT:    s_mov_b64 s[8:9], 0
1850; SI-NEXT:    s_waitcnt lgkmcnt(0)
1851; SI-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
1852; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
1853; SI-NEXT:    s_cbranch_vccz .LBB14_4
1854; SI-NEXT:  ; %bb.1: ; %else
1855; SI-NEXT:    v_mov_b32_e32 v0, s6
1856; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
1857; SI-NEXT:    s_mul_i32 s7, s4, s7
1858; SI-NEXT:    s_mul_i32 s5, s5, s6
1859; SI-NEXT:    s_mul_i32 s4, s4, s6
1860; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
1861; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
1862; SI-NEXT:    v_mov_b32_e32 v0, s4
1863; SI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1864; SI-NEXT:    s_cbranch_vccnz .LBB14_3
1865; SI-NEXT:  .LBB14_2: ; %if
1866; SI-NEXT:    s_mov_b32 s7, 0xf000
1867; SI-NEXT:    s_mov_b32 s6, -1
1868; SI-NEXT:    s_mov_b32 s4, s2
1869; SI-NEXT:    s_mov_b32 s5, s3
1870; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1871; SI-NEXT:  .LBB14_3: ; %endif
1872; SI-NEXT:    s_mov_b32 s3, 0xf000
1873; SI-NEXT:    s_mov_b32 s2, -1
1874; SI-NEXT:    s_waitcnt vmcnt(0)
1875; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1876; SI-NEXT:    s_endpgm
1877; SI-NEXT:  .LBB14_4:
1878; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
1879; SI-NEXT:    s_branch .LBB14_2
1880;
1881; VI-LABEL: mul64_in_branch:
1882; VI:       ; %bb.0: ; %entry
1883; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
1884; VI-NEXT:    s_mov_b64 s[8:9], 0
1885; VI-NEXT:    s_waitcnt lgkmcnt(0)
1886; VI-NEXT:    s_cmp_lg_u64 s[4:5], 0
1887; VI-NEXT:    s_cbranch_scc0 .LBB14_4
1888; VI-NEXT:  ; %bb.1: ; %else
1889; VI-NEXT:    v_mov_b32_e32 v0, s6
1890; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
1891; VI-NEXT:    s_mul_i32 s4, s4, s7
1892; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1893; VI-NEXT:    s_mul_i32 s4, s5, s6
1894; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1895; VI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1896; VI-NEXT:    s_cbranch_vccnz .LBB14_3
1897; VI-NEXT:  .LBB14_2: ; %if
1898; VI-NEXT:    s_mov_b32 s7, 0xf000
1899; VI-NEXT:    s_mov_b32 s6, -1
1900; VI-NEXT:    s_mov_b32 s4, s2
1901; VI-NEXT:    s_mov_b32 s5, s3
1902; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1903; VI-NEXT:  .LBB14_3: ; %endif
1904; VI-NEXT:    s_mov_b32 s3, 0xf000
1905; VI-NEXT:    s_mov_b32 s2, -1
1906; VI-NEXT:    s_waitcnt vmcnt(0)
1907; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1908; VI-NEXT:    s_endpgm
1909; VI-NEXT:  .LBB14_4:
1910; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
1911; VI-NEXT:    s_branch .LBB14_2
1912;
1913; GFX9-LABEL: mul64_in_branch:
1914; GFX9:       ; %bb.0: ; %entry
1915; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
1916; GFX9-NEXT:    s_mov_b64 s[8:9], 0
1917; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
1919; GFX9-NEXT:    s_cbranch_scc0 .LBB14_3
1920; GFX9-NEXT:  ; %bb.1: ; %else
1921; GFX9-NEXT:    s_mul_i32 s7, s4, s7
1922; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s6
1923; GFX9-NEXT:    s_add_i32 s7, s10, s7
1924; GFX9-NEXT:    s_mul_i32 s5, s5, s6
1925; GFX9-NEXT:    s_add_i32 s5, s7, s5
1926; GFX9-NEXT:    s_mul_i32 s4, s4, s6
1927; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1928; GFX9-NEXT:    s_cbranch_vccnz .LBB14_4
1929; GFX9-NEXT:  .LBB14_2: ; %if
1930; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1931; GFX9-NEXT:    s_mov_b32 s6, -1
1932; GFX9-NEXT:    s_mov_b32 s4, s2
1933; GFX9-NEXT:    s_mov_b32 s5, s3
1934; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1935; GFX9-NEXT:    s_branch .LBB14_5
1936; GFX9-NEXT:  .LBB14_3:
1937; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
1938; GFX9-NEXT:    s_branch .LBB14_2
1939; GFX9-NEXT:  .LBB14_4:
1940; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1941; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1942; GFX9-NEXT:  .LBB14_5: ; %endif
1943; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1944; GFX9-NEXT:    s_mov_b32 s2, -1
1945; GFX9-NEXT:    s_waitcnt vmcnt(0)
1946; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1947; GFX9-NEXT:    s_endpgm
1948;
1949; GFX10-LABEL: mul64_in_branch:
1950; GFX10:       ; %bb.0: ; %entry
1951; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
1952; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1953; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
1954; GFX10-NEXT:    s_cbranch_scc0 .LBB14_3
1955; GFX10-NEXT:  ; %bb.1: ; %else
1956; GFX10-NEXT:    s_mul_i32 s7, s4, s7
1957; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
1958; GFX10-NEXT:    s_mul_i32 s5, s5, s6
1959; GFX10-NEXT:    s_add_i32 s7, s8, s7
1960; GFX10-NEXT:    s_mul_i32 s4, s4, s6
1961; GFX10-NEXT:    s_add_i32 s5, s7, s5
1962; GFX10-NEXT:    s_mov_b32 s6, 0
1963; GFX10-NEXT:    s_cbranch_execnz .LBB14_4
1964; GFX10-NEXT:  .LBB14_2: ; %if
1965; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1966; GFX10-NEXT:    s_mov_b32 s6, -1
1967; GFX10-NEXT:    s_mov_b32 s4, s2
1968; GFX10-NEXT:    s_mov_b32 s5, s3
1969; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1970; GFX10-NEXT:    s_branch .LBB14_5
1971; GFX10-NEXT:  .LBB14_3:
1972; GFX10-NEXT:    s_mov_b32 s6, -1
1973; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
1974; GFX10-NEXT:    s_branch .LBB14_2
1975; GFX10-NEXT:  .LBB14_4:
1976; GFX10-NEXT:    v_mov_b32_e32 v0, s4
1977; GFX10-NEXT:    v_mov_b32_e32 v1, s5
1978; GFX10-NEXT:  .LBB14_5: ; %endif
1979; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1980; GFX10-NEXT:    s_mov_b32 s2, -1
1981; GFX10-NEXT:    s_waitcnt vmcnt(0)
1982; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1983; GFX10-NEXT:    s_endpgm
1984;
1985; GFX11-LABEL: mul64_in_branch:
1986; GFX11:       ; %bb.0: ; %entry
1987; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
1988; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1989; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
1990; GFX11-NEXT:    s_cbranch_scc0 .LBB14_3
1991; GFX11-NEXT:  ; %bb.1: ; %else
1992; GFX11-NEXT:    s_mul_i32 s7, s4, s7
1993; GFX11-NEXT:    s_mul_hi_u32 s8, s4, s6
1994; GFX11-NEXT:    s_mul_i32 s5, s5, s6
1995; GFX11-NEXT:    s_add_i32 s7, s8, s7
1996; GFX11-NEXT:    s_mul_i32 s4, s4, s6
1997; GFX11-NEXT:    s_add_i32 s5, s7, s5
1998; GFX11-NEXT:    s_mov_b32 s6, 0
1999; GFX11-NEXT:    s_cbranch_execnz .LBB14_4
2000; GFX11-NEXT:  .LBB14_2: ; %if
2001; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
2002; GFX11-NEXT:    s_mov_b32 s6, -1
2003; GFX11-NEXT:    s_mov_b32 s4, s2
2004; GFX11-NEXT:    s_mov_b32 s5, s3
2005; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
2006; GFX11-NEXT:    s_branch .LBB14_5
2007; GFX11-NEXT:  .LBB14_3:
2008; GFX11-NEXT:    s_mov_b32 s6, -1
2009; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
2010; GFX11-NEXT:    s_branch .LBB14_2
2011; GFX11-NEXT:  .LBB14_4:
2012; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2013; GFX11-NEXT:  .LBB14_5: ; %endif
2014; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2015; GFX11-NEXT:    s_mov_b32 s2, -1
2016; GFX11-NEXT:    s_waitcnt vmcnt(0)
2017; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2018; GFX11-NEXT:    s_nop 0
2019; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2020; GFX11-NEXT:    s_endpgm
2021;
2022; EG-LABEL: mul64_in_branch:
2023; EG:       ; %bb.0: ; %entry
2024; EG-NEXT:    ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
2025; EG-NEXT:    JUMP @3 POP:1
2026; EG-NEXT:    ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[]
2027; EG-NEXT:    ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
2028; EG-NEXT:    JUMP @8 POP:1
2029; EG-NEXT:    ALU 0, @34, KC0[CB0:0-32], KC1[]
2030; EG-NEXT:    TEX 0 @12
2031; EG-NEXT:    POP @8 POP:1
2032; EG-NEXT:    ALU 1, @35, KC0[], KC1[]
2033; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2034; EG-NEXT:    CF_END
2035; EG-NEXT:    PAD
2036; EG-NEXT:    Fetch clause starting at 12:
2037; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
2038; EG-NEXT:    ALU clause starting at 14:
2039; EG-NEXT:     OR_INT T0.W, KC0[2].W, KC0[3].X,
2040; EG-NEXT:     MOV * T1.W, literal.x,
2041; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
2042; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
2043; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
2044; EG-NEXT:    ALU clause starting at 19:
2045; EG-NEXT:     MOV T0.W, KC0[2].W,
2046; EG-NEXT:     MOV * T1.W, KC0[3].Z,
2047; EG-NEXT:     MOV T2.W, KC0[3].Y,
2048; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
2049; EG-NEXT:     MOV T1.W, KC0[3].X,
2050; EG-NEXT:     MULHI * T0.Y, T0.W, PV.W,
2051; EG-NEXT:     ADD_INT T3.W, PS, T0.X,
2052; EG-NEXT:     MULLO_INT * T0.X, PV.W, T2.W,
2053; EG-NEXT:     ADD_INT T0.Y, PV.W, PS,
2054; EG-NEXT:     MOV T1.W, literal.x,
2055; EG-NEXT:     MULLO_INT * T0.X, T0.W, T2.W,
2056; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2057; EG-NEXT:    ALU clause starting at 31:
2058; EG-NEXT:     MOV T0.W, KC0[2].Y,
2059; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
2060; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
2061; EG-NEXT:    ALU clause starting at 34:
2062; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2063; EG-NEXT:    ALU clause starting at 35:
2064; EG-NEXT:     LSHR * T1.X, T0.W, literal.x,
2065; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2066entry:
  ; Take the %if path when %a is zero, otherwise the %else path.
2067  %0 = icmp eq i64 %a, 0
2068  br i1 %0, label %if, label %else
2069
2070if:
  ; No multiply on this path: the result is read from memory.
2071  %1 = load i64, ptr addrspace(1) %in
2072  br label %endif
2073
2074else:
  ; The 64-bit multiply under test; both operands are kernel arguments.
2075  %2 = mul i64 %a, %b
2076  br label %endif
2077
2078endif:
  ; Merge the loaded value and the product, then store the result to %out.
2079  %3 = phi i64 [%1, %if], [%2, %else]
2080  store i64 %3, ptr addrspace(1) %out
2081  ret void
2082}
2083
; s_mul_i128: multiplies two i128 kernel arguments and stores the i128 product
; to %out. An unnamed [8 x i32] argument precedes each i128 operand in the
; signature. There is no control flow; the function is a single mul and store.
; Expected lowering differs per run line: verde combines v_mul_hi_u32 with
; s_mul_i32, tonga uses v_mad_u64_u32, and gfx900, gfx1010 and gfx1100 use
; scalar s_mul_i32 / s_mul_hi_u32 with s_add_u32 / s_addc_u32 carry chains.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them instead of editing by hand.
2084define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
2085; SI-LABEL: s_mul_i128:
2086; SI:       ; %bb.0: ; %entry
2087; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x13
2088; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x1f
2089; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2090; SI-NEXT:    s_mov_b32 s3, 0xf000
2091; SI-NEXT:    s_mov_b32 s2, -1
2092; SI-NEXT:    s_waitcnt lgkmcnt(0)
2093; SI-NEXT:    v_mov_b32_e32 v0, s6
2094; SI-NEXT:    v_mul_hi_u32 v0, s8, v0
2095; SI-NEXT:    v_mov_b32_e32 v1, s4
2096; SI-NEXT:    v_mul_hi_u32 v1, s10, v1
2097; SI-NEXT:    s_mul_i32 s7, s8, s7
2098; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
2099; SI-NEXT:    s_mul_i32 s7, s10, s5
2100; SI-NEXT:    s_mul_i32 s12, s9, s6
2101; SI-NEXT:    s_mul_i32 s6, s8, s6
2102; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
2103; SI-NEXT:    s_mul_i32 s7, s11, s4
2104; SI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
2105; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
2106; SI-NEXT:    s_mul_i32 s7, s10, s4
2107; SI-NEXT:    v_mov_b32_e32 v2, s6
2108; SI-NEXT:    v_add_i32_e32 v2, vcc, s7, v2
2109; SI-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
2110; SI-NEXT:    v_mov_b32_e32 v1, s8
2111; SI-NEXT:    v_mul_hi_u32 v5, s4, v1
2112; SI-NEXT:    v_mul_hi_u32 v1, s5, v1
2113; SI-NEXT:    v_mov_b32_e32 v3, s9
2114; SI-NEXT:    v_mul_hi_u32 v4, s4, v3
2115; SI-NEXT:    s_mul_i32 s7, s5, s8
2116; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
2117; SI-NEXT:    s_mul_i32 s6, s4, s9
2118; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
2119; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v5
2120; SI-NEXT:    v_mul_hi_u32 v3, s5, v3
2121; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
2122; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
2123; SI-NEXT:    s_mul_i32 s5, s5, s9
2124; SI-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, 0, vcc
2125; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v4
2126; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2127; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
2128; SI-NEXT:    s_mul_i32 s4, s4, s8
2129; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
2130; SI-NEXT:    v_mov_b32_e32 v0, s4
2131; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2132; SI-NEXT:    s_endpgm
2133;
2134; VI-LABEL: s_mul_i128:
2135; VI:       ; %bb.0: ; %entry
2136; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
2137; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
2138; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2139; VI-NEXT:    v_mov_b32_e32 v5, 0
2140; VI-NEXT:    s_mov_b32 s3, 0xf000
2141; VI-NEXT:    s_waitcnt lgkmcnt(0)
2142; VI-NEXT:    v_mov_b32_e32 v0, s6
2143; VI-NEXT:    v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
2144; VI-NEXT:    s_mul_i32 s7, s8, s7
2145; VI-NEXT:    v_mov_b32_e32 v6, s8
2146; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
2147; VI-NEXT:    s_mul_i32 s12, s9, s6
2148; VI-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
2149; VI-NEXT:    v_add_u32_e32 v3, vcc, s12, v3
2150; VI-NEXT:    v_mov_b32_e32 v4, v1
2151; VI-NEXT:    v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
2152; VI-NEXT:    v_mov_b32_e32 v8, s4
2153; VI-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
2154; VI-NEXT:    v_mov_b32_e32 v3, v7
2155; VI-NEXT:    v_mov_b32_e32 v7, v5
2156; VI-NEXT:    v_mov_b32_e32 v8, s9
2157; VI-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
2158; VI-NEXT:    s_mul_i32 s8, s11, s4
2159; VI-NEXT:    v_add_u32_e32 v6, vcc, s8, v2
2160; VI-NEXT:    v_mov_b32_e32 v2, v5
2161; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
2162; VI-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
2163; VI-NEXT:    s_mul_i32 s8, s10, s5
2164; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
2165; VI-NEXT:    v_add_u32_e32 v5, vcc, s8, v6
2166; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
2167; VI-NEXT:    s_mov_b32 s2, -1
2168; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
2169; VI-NEXT:    v_mov_b32_e32 v1, v4
2170; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2171; VI-NEXT:    s_endpgm
2172;
2173; GFX9-LABEL: s_mul_i128:
2174; GFX9:       ; %bb.0: ; %entry
2175; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x4c
2176; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x7c
2177; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2178; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2179; GFX9-NEXT:    s_mov_b32 s6, -1
2180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2181; GFX9-NEXT:    s_mul_i32 s0, s12, s11
2182; GFX9-NEXT:    s_mul_hi_u32 s1, s12, s10
2183; GFX9-NEXT:    s_mul_i32 s2, s14, s9
2184; GFX9-NEXT:    s_mul_hi_u32 s3, s14, s8
2185; GFX9-NEXT:    s_add_i32 s0, s1, s0
2186; GFX9-NEXT:    s_mul_i32 s1, s13, s10
2187; GFX9-NEXT:    s_add_i32 s2, s3, s2
2188; GFX9-NEXT:    s_mul_i32 s3, s15, s8
2189; GFX9-NEXT:    s_add_i32 s0, s0, s1
2190; GFX9-NEXT:    s_mul_i32 s1, s12, s10
2191; GFX9-NEXT:    s_add_i32 s2, s2, s3
2192; GFX9-NEXT:    s_mul_i32 s3, s14, s8
2193; GFX9-NEXT:    s_add_u32 s3, s3, s1
2194; GFX9-NEXT:    s_addc_u32 s2, s2, s0
2195; GFX9-NEXT:    s_mul_i32 s14, s9, s12
2196; GFX9-NEXT:    s_mul_hi_u32 s15, s8, s12
2197; GFX9-NEXT:    s_mul_hi_u32 s11, s9, s12
2198; GFX9-NEXT:    s_add_u32 s14, s14, s15
2199; GFX9-NEXT:    s_mul_i32 s1, s8, s13
2200; GFX9-NEXT:    s_addc_u32 s11, s11, 0
2201; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s13
2202; GFX9-NEXT:    s_add_u32 s1, s1, s14
2203; GFX9-NEXT:    s_addc_u32 s10, s10, 0
2204; GFX9-NEXT:    s_add_u32 s10, s11, s10
2205; GFX9-NEXT:    s_addc_u32 s11, 0, 0
2206; GFX9-NEXT:    s_mul_hi_u32 s14, s9, s13
2207; GFX9-NEXT:    s_mul_i32 s9, s9, s13
2208; GFX9-NEXT:    s_add_u32 s9, s9, s10
2209; GFX9-NEXT:    s_addc_u32 s10, s14, s11
2210; GFX9-NEXT:    s_mov_b32 s0, 0
2211; GFX9-NEXT:    s_add_u32 s9, s9, s3
2212; GFX9-NEXT:    s_addc_u32 s10, s10, s2
2213; GFX9-NEXT:    s_mul_i32 s2, s8, s12
2214; GFX9-NEXT:    s_mov_b32 s3, s0
2215; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
2216; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2217; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2218; GFX9-NEXT:    v_mov_b32_e32 v2, s9
2219; GFX9-NEXT:    v_mov_b32_e32 v3, s10
2220; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2221; GFX9-NEXT:    s_endpgm
2222;
2223; GFX10-LABEL: s_mul_i128:
2224; GFX10:       ; %bb.0: ; %entry
2225; GFX10-NEXT:    s_clause 0x1
2226; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
2227; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
2228; GFX10-NEXT:    s_mov_b32 s2, 0
2229; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2230; GFX10-NEXT:    s_mov_b32 s13, s2
2231; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX10-NEXT:    s_mul_i32 s3, s8, s7
2233; GFX10-NEXT:    s_mul_hi_u32 s7, s8, s6
2234; GFX10-NEXT:    s_mul_i32 s14, s10, s5
2235; GFX10-NEXT:    s_mul_hi_u32 s15, s10, s4
2236; GFX10-NEXT:    s_mul_i32 s12, s9, s6
2237; GFX10-NEXT:    s_mul_i32 s11, s11, s4
2238; GFX10-NEXT:    s_add_i32 s3, s7, s3
2239; GFX10-NEXT:    s_add_i32 s7, s15, s14
2240; GFX10-NEXT:    s_mul_i32 s6, s8, s6
2241; GFX10-NEXT:    s_mul_i32 s10, s10, s4
2242; GFX10-NEXT:    s_add_i32 s3, s3, s12
2243; GFX10-NEXT:    s_add_i32 s7, s7, s11
2244; GFX10-NEXT:    s_mul_i32 s19, s5, s8
2245; GFX10-NEXT:    s_mul_hi_u32 s20, s4, s8
2246; GFX10-NEXT:    s_add_u32 s6, s10, s6
2247; GFX10-NEXT:    s_mul_hi_u32 s18, s5, s8
2248; GFX10-NEXT:    s_addc_u32 s7, s7, s3
2249; GFX10-NEXT:    s_mul_i32 s17, s4, s9
2250; GFX10-NEXT:    s_add_u32 s3, s19, s20
2251; GFX10-NEXT:    s_mul_hi_u32 s16, s4, s9
2252; GFX10-NEXT:    s_mul_hi_u32 s21, s5, s9
2253; GFX10-NEXT:    s_mul_i32 s5, s5, s9
2254; GFX10-NEXT:    s_addc_u32 s9, s18, 0
2255; GFX10-NEXT:    s_add_u32 s3, s17, s3
2256; GFX10-NEXT:    s_addc_u32 s10, s16, 0
2257; GFX10-NEXT:    s_mul_i32 s12, s4, s8
2258; GFX10-NEXT:    s_add_u32 s4, s9, s10
2259; GFX10-NEXT:    s_addc_u32 s8, 0, 0
2260; GFX10-NEXT:    s_add_u32 s4, s5, s4
2261; GFX10-NEXT:    s_addc_u32 s5, s21, s8
2262; GFX10-NEXT:    s_add_u32 s4, s4, s6
2263; GFX10-NEXT:    s_addc_u32 s5, s5, s7
2264; GFX10-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
2265; GFX10-NEXT:    v_mov_b32_e32 v2, s4
2266; GFX10-NEXT:    v_mov_b32_e32 v0, s2
2267; GFX10-NEXT:    v_mov_b32_e32 v1, s3
2268; GFX10-NEXT:    v_mov_b32_e32 v3, s5
2269; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2270; GFX10-NEXT:    s_mov_b32 s2, -1
2271; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2272; GFX10-NEXT:    s_endpgm
2273;
2274; GFX11-LABEL: s_mul_i128:
2275; GFX11:       ; %bb.0: ; %entry
2276; GFX11-NEXT:    s_clause 0x2
2277; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x4c
2278; GFX11-NEXT:    s_load_b128 s[8:11], s[0:1], 0x7c
2279; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2280; GFX11-NEXT:    s_mov_b32 s2, 0
2281; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2282; GFX11-NEXT:    s_mov_b32 s13, s2
2283; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2284; GFX11-NEXT:    s_mul_i32 s3, s8, s7
2285; GFX11-NEXT:    s_mul_hi_u32 s7, s8, s6
2286; GFX11-NEXT:    s_mul_i32 s14, s10, s5
2287; GFX11-NEXT:    s_mul_hi_u32 s15, s10, s4
2288; GFX11-NEXT:    s_mul_i32 s12, s9, s6
2289; GFX11-NEXT:    s_mul_i32 s11, s11, s4
2290; GFX11-NEXT:    s_add_i32 s3, s7, s3
2291; GFX11-NEXT:    s_add_i32 s7, s15, s14
2292; GFX11-NEXT:    s_mul_i32 s6, s8, s6
2293; GFX11-NEXT:    s_mul_i32 s10, s10, s4
2294; GFX11-NEXT:    s_add_i32 s3, s3, s12
2295; GFX11-NEXT:    s_add_i32 s7, s7, s11
2296; GFX11-NEXT:    s_mul_i32 s19, s5, s8
2297; GFX11-NEXT:    s_mul_hi_u32 s20, s4, s8
2298; GFX11-NEXT:    s_add_u32 s6, s10, s6
2299; GFX11-NEXT:    s_mul_hi_u32 s18, s5, s8
2300; GFX11-NEXT:    s_addc_u32 s7, s7, s3
2301; GFX11-NEXT:    s_mul_i32 s17, s4, s9
2302; GFX11-NEXT:    s_add_u32 s3, s19, s20
2303; GFX11-NEXT:    s_mul_hi_u32 s16, s4, s9
2304; GFX11-NEXT:    s_mul_hi_u32 s21, s5, s9
2305; GFX11-NEXT:    s_mul_i32 s5, s5, s9
2306; GFX11-NEXT:    s_addc_u32 s9, s18, 0
2307; GFX11-NEXT:    s_add_u32 s3, s17, s3
2308; GFX11-NEXT:    s_addc_u32 s10, s16, 0
2309; GFX11-NEXT:    s_mul_i32 s12, s4, s8
2310; GFX11-NEXT:    s_add_u32 s4, s9, s10
2311; GFX11-NEXT:    s_addc_u32 s8, 0, 0
2312; GFX11-NEXT:    s_add_u32 s4, s5, s4
2313; GFX11-NEXT:    s_addc_u32 s5, s21, s8
2314; GFX11-NEXT:    s_add_u32 s4, s4, s6
2315; GFX11-NEXT:    s_addc_u32 s5, s5, s7
2316; GFX11-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
2317; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2318; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
2319; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
2320; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2321; GFX11-NEXT:    s_mov_b32 s2, -1
2322; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
2323; GFX11-NEXT:    s_nop 0
2324; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2325; GFX11-NEXT:    s_endpgm
2326;
2327; EG-LABEL: s_mul_i128:
2328; EG:       ; %bb.0: ; %entry
2329; EG-NEXT:    ALU 41, @4, KC0[CB0:0-32], KC1[]
2330; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2331; EG-NEXT:    CF_END
2332; EG-NEXT:    PAD
2333; EG-NEXT:    ALU clause starting at 4:
2334; EG-NEXT:     MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
2335; EG-NEXT:     MULHI * T0.Y, KC0[5].X, KC0[8].X,
2336; EG-NEXT:     MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
2337; EG-NEXT:     MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
2338; EG-NEXT:     MULHI * T1.X, KC0[5].X, KC0[7].W,
2339; EG-NEXT:     MULHI * T1.Y, KC0[4].W, KC0[8].X,
2340; EG-NEXT:     MULHI * T1.Z, KC0[8].Y, KC0[4].W,
2341; EG-NEXT:     MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
2342; EG-NEXT:     MULHI * T2.X, KC0[7].W, KC0[5].Y,
2343; EG-NEXT:     MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
2344; EG-NEXT:     MULHI * T2.Z, KC0[4].W, KC0[7].W,
2345; EG-NEXT:     ADD_INT T2.W, T2.Y, PS,
2346; EG-NEXT:     MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
2347; EG-NEXT:     ADDC_UINT T2.Z, T2.Y, T2.Z,
2348; EG-NEXT:     ADDC_UINT T3.W, PS, PV.W,
2349; EG-NEXT:     MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
2350; EG-NEXT:     ADD_INT T2.X, T2.X, PS,
2351; EG-NEXT:     ADD_INT T2.Y, T1.Z, T1.W,
2352; EG-NEXT:     ADD_INT T1.Z, T1.Y, PV.W,
2353; EG-NEXT:     ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
2354; EG-NEXT:     MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
2355; EG-NEXT:     ADD_INT T4.X, PV.W, PV.Z,
2356; EG-NEXT:     ADDC_UINT T1.Y, PV.W, PV.Z,
2357; EG-NEXT:     ADD_INT T1.Z, PV.Y, PS,
2358; EG-NEXT:     ADD_INT T0.W, PV.X, T0.W,
2359; EG-NEXT:     MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
2360; EG-NEXT:     ADD_INT T2.Y, PV.Z, PV.W,
2361; EG-NEXT:     ADDC_UINT T1.Z, T0.Z, PS,
2362; EG-NEXT:     ADD_INT T0.W, T0.Y, PV.Y,
2363; EG-NEXT:     ADDC_UINT * T1.W, T0.X, PV.X,
2364; EG-NEXT:     ADD_INT T0.Y, T0.X, T4.X,
2365; EG-NEXT:     ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
2366; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2367; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
2368; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2369; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
2370; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
2371; EG-NEXT:     ADD_INT * T0.Z, T0.Y, T0.Z,
2372; EG-NEXT:     ADD_INT * T0.Y, T3.X, T2.W,
2373; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2374; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2375; EG-NEXT:     MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
2376entry:
  ; The 128-bit multiply under test; both operands are kernel arguments.
2377  %mul = mul i128 %a, %b
2378  store i128 %mul, ptr addrspace(1) %out
2379  ret void
2380}
2381
; v_mul_i128: per-work-item 128-bit integer multiply.
;
; Each work-item uses its x work-item id (%tid) to index an i128 element in
; %aptr and in %bptr, multiplies the two loaded values, and stores the i128
; product. The assertion blocks below (prefixes SI / VI / GFX9 / GFX10 /
; GFX11 / EG, one per RUN line at the top of the file) are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead
; of editing them by hand.
;
; NOTE(review): %out is never referenced in the body -- the store address
; (%gep.out) is computed from %bptr. This may be unintentional; changing it
; requires regenerating every assertion block, so confirm before touching.
2382define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2383; SI-LABEL: v_mul_i128:
2384; SI:       ; %bb.0: ; %entry
2385; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2386; SI-NEXT:    s_mov_b32 s7, 0xf000
2387; SI-NEXT:    s_mov_b32 s6, 0
2388; SI-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
2389; SI-NEXT:    v_mov_b32_e32 v9, 0
2390; SI-NEXT:    s_waitcnt lgkmcnt(0)
2391; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
2392; SI-NEXT:    s_mov_b64 s[0:1], s[2:3]
2393; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2394; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
2395; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
2396; SI-NEXT:    s_waitcnt vmcnt(0)
2397; SI-NEXT:    v_mul_lo_u32 v3, v4, v3
2398; SI-NEXT:    v_mul_hi_u32 v10, v4, v2
2399; SI-NEXT:    v_mul_lo_u32 v12, v6, v1
2400; SI-NEXT:    v_mul_hi_u32 v13, v6, v0
2401; SI-NEXT:    v_mul_lo_u32 v17, v1, v4
2402; SI-NEXT:    v_mul_hi_u32 v18, v0, v4
2403; SI-NEXT:    v_mul_lo_u32 v11, v5, v2
2404; SI-NEXT:    v_mul_lo_u32 v7, v7, v0
2405; SI-NEXT:    v_mul_hi_u32 v16, v1, v4
2406; SI-NEXT:    v_mul_lo_u32 v15, v0, v5
2407; SI-NEXT:    v_mul_hi_u32 v14, v0, v5
2408; SI-NEXT:    v_mul_hi_u32 v19, v1, v5
2409; SI-NEXT:    v_mul_lo_u32 v5, v1, v5
2410; SI-NEXT:    v_add_i32_e32 v1, vcc, v10, v3
2411; SI-NEXT:    v_add_i32_e32 v3, vcc, v13, v12
2412; SI-NEXT:    v_mul_lo_u32 v2, v4, v2
2413; SI-NEXT:    v_mul_lo_u32 v6, v6, v0
2414; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
2415; SI-NEXT:    v_add_i32_e32 v4, vcc, v17, v18
2416; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v16, vcc
2417; SI-NEXT:    v_add_i32_e32 v11, vcc, v1, v11
2418; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
2419; SI-NEXT:    v_add_i32_e32 v1, vcc, v15, v4
2420; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
2421; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
2422; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
2423; SI-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
2424; SI-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
2425; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2426; SI-NEXT:    v_addc_u32_e32 v5, vcc, v19, v6, vcc
2427; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
2428; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
2429; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
2430; SI-NEXT:    s_endpgm
2431;
2432; VI-LABEL: v_mul_i128:
2433; VI:       ; %bb.0: ; %entry
2434; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
2435; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
2436; VI-NEXT:    v_mov_b32_e32 v11, 0
2437; VI-NEXT:    s_waitcnt lgkmcnt(0)
2438; VI-NEXT:    v_mov_b32_e32 v1, s1
2439; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2440; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2441; VI-NEXT:    v_mov_b32_e32 v3, s3
2442; VI-NEXT:    v_add_u32_e32 v8, vcc, s2, v2
2443; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
2444; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2445; VI-NEXT:    flat_load_dwordx4 v[4:7], v[8:9]
2446; VI-NEXT:    s_waitcnt vmcnt(0)
2447; VI-NEXT:    v_mul_lo_u32 v10, v4, v3
2448; VI-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
2449; VI-NEXT:    v_mul_lo_u32 v14, v5, v2
2450; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
2451; VI-NEXT:    v_mul_lo_u32 v15, v7, v0
2452; VI-NEXT:    v_add_u32_e32 v7, vcc, v13, v10
2453; VI-NEXT:    v_mov_b32_e32 v10, v3
2454; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
2455; VI-NEXT:    v_add_u32_e32 v13, vcc, v7, v14
2456; VI-NEXT:    v_mov_b32_e32 v7, v4
2457; VI-NEXT:    v_mov_b32_e32 v4, v11
2458; VI-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
2459; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
2460; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v13
2461; VI-NEXT:    v_mov_b32_e32 v0, v4
2462; VI-NEXT:    v_mul_lo_u32 v10, v6, v1
2463; VI-NEXT:    v_add_u32_e32 v6, vcc, v7, v0
2464; VI-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
2465; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
2466; VI-NEXT:    v_add_u32_e32 v5, vcc, v10, v11
2467; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v12
2468; VI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
2469; VI-NEXT:    flat_store_dwordx4 v[8:9], v[2:5]
2470; VI-NEXT:    s_endpgm
2471;
2472; GFX9-LABEL: v_mul_i128:
2473; GFX9:       ; %bb.0: ; %entry
2474; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
2475; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
2476; GFX9-NEXT:    v_mov_b32_e32 v10, 0
2477; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2478; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
2479; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
2480; GFX9-NEXT:    s_waitcnt vmcnt(0)
2481; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
2482; GFX9-NEXT:    v_mul_lo_u32 v14, v5, v2
2483; GFX9-NEXT:    v_mul_lo_u32 v15, v4, v3
2484; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
2485; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
2486; GFX9-NEXT:    v_mov_b32_e32 v4, v12
2487; GFX9-NEXT:    v_mov_b32_e32 v12, v10
2488; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
2489; GFX9-NEXT:    v_add3_u32 v3, v3, v15, v14
2490; GFX9-NEXT:    v_mul_lo_u32 v17, v7, v0
2491; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
2492; GFX9-NEXT:    v_mov_b32_e32 v0, v10
2493; GFX9-NEXT:    v_mul_lo_u32 v16, v6, v1
2494; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v0
2495; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
2496; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
2497; GFX9-NEXT:    v_add3_u32 v3, v17, v3, v16
2498; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v0, v2
2499; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
2500; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
2501; GFX9-NEXT:    s_endpgm
2502;
2503; GFX10-LABEL: v_mul_i128:
2504; GFX10:       ; %bb.0: ; %entry
2505; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
2506; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 4, v0
2507; GFX10-NEXT:    v_mov_b32_e32 v10, 0
2508; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2509; GFX10-NEXT:    s_clause 0x1
2510; GFX10-NEXT:    global_load_dwordx4 v[0:3], v14, s[0:1]
2511; GFX10-NEXT:    global_load_dwordx4 v[4:7], v14, s[2:3]
2512; GFX10-NEXT:    s_waitcnt vmcnt(0)
2513; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
2514; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v0
2515; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
2516; GFX10-NEXT:    v_mov_b32_e32 v9, v12
2517; GFX10-NEXT:    v_mov_b32_e32 v12, v10
2518; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v2
2519; GFX10-NEXT:    v_mad_u64_u32 v[12:13], s0, v0, v5, v[11:12]
2520; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v3
2521; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v2, 0
2522; GFX10-NEXT:    v_mov_b32_e32 v4, v13
2523; GFX10-NEXT:    v_mul_lo_u32 v13, v6, v1
2524; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v10
2525; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v4
2526; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s0, 0, 0, s0
2527; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
2528; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v5, v[9:10]
2529; GFX10-NEXT:    v_mov_b32_e32 v9, v12
2530; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v13
2531; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
2532; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
2533; GFX10-NEXT:    global_store_dwordx4 v14, v[8:11], s[2:3]
2534; GFX10-NEXT:    s_endpgm
2535;
2536; GFX11-LABEL: v_mul_i128:
2537; GFX11:       ; %bb.0: ; %entry
2538; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x2c
2539; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
2540; GFX11-NEXT:    v_mov_b32_e32 v10, 0
2541; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX11-NEXT:    s_clause 0x1
2543; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[0:1]
2544; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[2:3]
2545; GFX11-NEXT:    s_waitcnt vmcnt(0)
2546; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v0, v4, 0
2547; GFX11-NEXT:    v_mul_lo_u32 v15, v5, v2
2548; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
2549; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2550; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
2551; GFX11-NEXT:    v_dual_mov_b32 v9, v12 :: v_dual_mov_b32 v12, v10
2552; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2553; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v0, v5, v[11:12]
2554; GFX11-NEXT:    v_mad_u64_u32 v[10:11], null, v4, v2, 0
2555; GFX11-NEXT:    v_mul_lo_u32 v4, v6, v1
2556; GFX11-NEXT:    v_mul_lo_u32 v12, v7, v0
2557; GFX11-NEXT:    v_mov_b32_e32 v2, v14
2558; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
2559; GFX11-NEXT:    v_add3_u32 v11, v11, v3, v15
2560; GFX11-NEXT:    v_add_co_u32 v2, s0, v9, v2
2561; GFX11-NEXT:    v_mov_b32_e32 v9, v13
2562; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
2563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
2564; GFX11-NEXT:    v_mad_u64_u32 v[14:15], null, v6, v0, v[10:11]
2565; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
2566; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2567; GFX11-NEXT:    v_add3_u32 v0, v12, v15, v4
2568; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v14
2569; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2570; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
2571; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[2:3]
2572; GFX11-NEXT:    s_nop 0
2573; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2574; GFX11-NEXT:    s_endpgm
2575;
2576; EG-LABEL: v_mul_i128:
2577; EG:       ; %bb.0: ; %entry
2578; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
2579; EG-NEXT:    TEX 1 @6
2580; EG-NEXT:    ALU 41, @14, KC0[], KC1[]
2581; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2582; EG-NEXT:    CF_END
2583; EG-NEXT:    PAD
2584; EG-NEXT:    Fetch clause starting at 6:
2585; EG-NEXT:     VTX_READ_128 T2.XYZW, T1.X, 0, #1
2586; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2587; EG-NEXT:    ALU clause starting at 10:
2588; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2589; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
2590; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
2591; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
2592; EG-NEXT:    ALU clause starting at 14:
2593; EG-NEXT:     MULLO_INT * T1.Y, T0.Y, T2.Y,
2594; EG-NEXT:     MULHI * T1.Z, T0.Y, T2.Y,
2595; EG-NEXT:     MULLO_INT * T1.W, T2.Z, T0.X,
2596; EG-NEXT:     MULLO_INT * T3.X, T2.Y, T0.Z,
2597; EG-NEXT:     MULHI * T3.Y, T0.Y, T2.X,
2598; EG-NEXT:     MULHI * T3.Z, T0.X, T2.Y,
2599; EG-NEXT:     MULHI * T3.W, T2.Z, T0.X,
2600; EG-NEXT:     MULLO_INT * T2.Z, T2.Z, T0.Y,
2601; EG-NEXT:     MULHI * T4.X, T2.X, T0.Z,
2602; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T2.X,
2603; EG-NEXT:     MULHI * T4.Y, T0.X, T2.X,
2604; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
2605; EG-NEXT:     MULLO_INT * T2.Y, T0.X, T2.Y,
2606; EG-NEXT:     ADDC_UINT T4.Z, T0.Y, T4.Y,
2607; EG-NEXT:     ADDC_UINT T5.W, PS, PV.W,
2608; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.W,
2609; EG-NEXT:     ADD_INT T4.X, T4.X, PS,
2610; EG-NEXT:     ADD_INT T0.Y, T3.W, T2.Z,
2611; EG-NEXT:     ADD_INT T2.Z, T3.Z, PV.W,
2612; EG-NEXT:     ADD_INT T0.W, T3.Y, PV.Z,
2613; EG-NEXT:     MULLO_INT * T2.W, T2.W, T0.X,
2614; EG-NEXT:     ADD_INT T5.X, PV.W, PV.Z,
2615; EG-NEXT:     ADDC_UINT T3.Y, PV.W, PV.Z,
2616; EG-NEXT:     ADD_INT T2.Z, PV.Y, PS,
2617; EG-NEXT:     ADD_INT T0.W, PV.X, T3.X,
2618; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.Z,
2619; EG-NEXT:     ADD_INT T4.Y, PV.Z, PV.W,
2620; EG-NEXT:     ADDC_UINT T0.Z, T1.W, PS,
2621; EG-NEXT:     ADD_INT T0.W, T1.Z, PV.Y,
2622; EG-NEXT:     ADDC_UINT * T2.W, T1.Y, PV.X,
2623; EG-NEXT:     ADD_INT T1.Y, T1.Y, T5.X,
2624; EG-NEXT:     ADD_INT T1.Z, T1.W, T0.Y,
2625; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2626; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
2627; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2628; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
2629; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
2630; EG-NEXT:     ADD_INT * T0.Z, T1.Y, T1.Z,
2631; EG-NEXT:     ADD_INT * T0.Y, T2.Y, T4.W,
2632; EG-NEXT:     LSHR T1.X, T1.X, literal.x,
2633; EG-NEXT:     MULLO_INT * T0.X, T0.X, T2.X,
2634; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2635entry:
; Per-work-item element index, taken from the x work-item id.
2636  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Addresses of this work-item's i128 operands in %aptr and %bptr.
2637  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
2638  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
; NOTE(review): the result address is based on %bptr (same element as
; %gep.b), not on %out -- the product overwrites the second operand.
2639  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
2640  %a = load i128, ptr addrspace(1) %gep.a
2641  %b = load i128, ptr addrspace(1) %gep.b
; Full-width i128 multiply; this is the operation under test.
2642  %mul = mul i128 %a, %b
2643  store i128 %mul, ptr addrspace(1) %gep.out
2644  ret void
2645}
2646
; Intrinsic called by @v_mul_i128 to obtain the i32 %tid used as the
; per-work-item element index. Carries attribute group #1.
2647declare i32 @llvm.amdgcn.workitem.id.x() #1
2648
; Attribute groups referenced in this file:
;   #0 - attached to kernel definitions such as @v_mul_i128.
;   #1 - attached to the workitem.id.x intrinsic declaration above.
2649attributes #0 = { nounwind }
2650attributes #1 = { nounwind readnone}
2651