xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mul.ll (revision b434051dc83d77c8e8e349ab1992dcb0c795a7ea)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
8
9; mul24 and mad24 are affected
10
; Element-wise multiply of two <2 x i32> vectors. %a is read from %in and %b
; from the next <2 x i32> slot (%in advanced by one element); the product is
; stored to %out. The assertions below expect both inputs to arrive through a
; single 128-bit load and one low 32-bit multiply per lane (v_mul_lo_u32 on
; the amdgcn targets, MULLO_INT on r600).
11define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
12; SI-LABEL: test_mul_v2i32:
13; SI:       ; %bb.0: ; %entry
14; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s6, -1
17; SI-NEXT:    s_mov_b32 s10, s6
18; SI-NEXT:    s_mov_b32 s11, s7
19; SI-NEXT:    s_waitcnt lgkmcnt(0)
20; SI-NEXT:    s_mov_b32 s8, s2
21; SI-NEXT:    s_mov_b32 s9, s3
22; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
23; SI-NEXT:    s_mov_b32 s4, s0
24; SI-NEXT:    s_mov_b32 s5, s1
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_mul_lo_u32 v1, v1, v3
27; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
28; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
29; SI-NEXT:    s_endpgm
30;
31; VI-LABEL: test_mul_v2i32:
32; VI:       ; %bb.0: ; %entry
33; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
34; VI-NEXT:    s_mov_b32 s7, 0xf000
35; VI-NEXT:    s_mov_b32 s6, -1
36; VI-NEXT:    s_mov_b32 s10, s6
37; VI-NEXT:    s_mov_b32 s11, s7
38; VI-NEXT:    s_waitcnt lgkmcnt(0)
39; VI-NEXT:    s_mov_b32 s8, s2
40; VI-NEXT:    s_mov_b32 s9, s3
41; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
42; VI-NEXT:    s_mov_b32 s4, s0
43; VI-NEXT:    s_mov_b32 s5, s1
44; VI-NEXT:    s_waitcnt vmcnt(0)
45; VI-NEXT:    v_mul_lo_u32 v1, v1, v3
46; VI-NEXT:    v_mul_lo_u32 v0, v0, v2
47; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
48; VI-NEXT:    s_endpgm
49;
50; GFX9-LABEL: test_mul_v2i32:
51; GFX9:       ; %bb.0: ; %entry
52; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
53; GFX9-NEXT:    s_mov_b32 s7, 0xf000
54; GFX9-NEXT:    s_mov_b32 s6, -1
55; GFX9-NEXT:    s_mov_b32 s10, s6
56; GFX9-NEXT:    s_mov_b32 s11, s7
57; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-NEXT:    s_mov_b32 s8, s2
59; GFX9-NEXT:    s_mov_b32 s9, s3
60; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
61; GFX9-NEXT:    s_mov_b32 s4, s0
62; GFX9-NEXT:    s_mov_b32 s5, s1
63; GFX9-NEXT:    s_waitcnt vmcnt(0)
64; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
65; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
66; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
67; GFX9-NEXT:    s_endpgm
68;
69; GFX10-LABEL: test_mul_v2i32:
70; GFX10:       ; %bb.0: ; %entry
71; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
72; GFX10-NEXT:    s_mov_b32 s6, -1
73; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
74; GFX10-NEXT:    s_mov_b32 s10, s6
75; GFX10-NEXT:    s_mov_b32 s11, s7
76; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX10-NEXT:    s_mov_b32 s8, s2
78; GFX10-NEXT:    s_mov_b32 s9, s3
79; GFX10-NEXT:    s_mov_b32 s4, s0
80; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
81; GFX10-NEXT:    s_mov_b32 s5, s1
82; GFX10-NEXT:    s_waitcnt vmcnt(0)
83; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
84; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
85; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
86; GFX10-NEXT:    s_endpgm
87;
88; GFX11-LABEL: test_mul_v2i32:
89; GFX11:       ; %bb.0: ; %entry
90; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
91; GFX11-NEXT:    s_mov_b32 s6, -1
92; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
93; GFX11-NEXT:    s_mov_b32 s10, s6
94; GFX11-NEXT:    s_mov_b32 s11, s7
95; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX11-NEXT:    s_mov_b32 s8, s2
97; GFX11-NEXT:    s_mov_b32 s9, s3
98; GFX11-NEXT:    s_mov_b32 s4, s0
99; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
100; GFX11-NEXT:    s_mov_b32 s5, s1
101; GFX11-NEXT:    s_waitcnt vmcnt(0)
102; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v3
103; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v2
104; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
105; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
106; GFX11-NEXT:    s_endpgm
107;
108; EG-LABEL: test_mul_v2i32:
109; EG:       ; %bb.0: ; %entry
110; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
111; EG-NEXT:    TEX 0 @6
112; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
113; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
114; EG-NEXT:    CF_END
115; EG-NEXT:    PAD
116; EG-NEXT:    Fetch clause starting at 6:
117; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
118; EG-NEXT:    ALU clause starting at 8:
119; EG-NEXT:     MOV * T0.X, KC0[2].Z,
120; EG-NEXT:    ALU clause starting at 9:
121; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T0.W,
122; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
123; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Z,
124; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
125entry:
126  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
127  %a = load <2 x i32>, ptr addrspace(1) %in
128  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
129  %result = mul <2 x i32> %a, %b
130  store <2 x i32> %result, ptr addrspace(1) %out
131  ret void
132}
133
; Element-wise multiply of two <4 x i32> vectors loaded from %in and from the
; following <4 x i32> slot; the product is stored to %out. The assertions
; expect two 128-bit loads (the second at byte offset 16) feeding four low
; 32-bit multiplies (v_mul_lo_u32 on the amdgcn targets, MULLO_INT on r600).
134define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
135; SI-LABEL: v_mul_v4i32:
136; SI:       ; %bb.0: ; %entry
137; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
138; SI-NEXT:    s_mov_b32 s7, 0xf000
139; SI-NEXT:    s_mov_b32 s6, -1
140; SI-NEXT:    s_mov_b32 s10, s6
141; SI-NEXT:    s_mov_b32 s11, s7
142; SI-NEXT:    s_waitcnt lgkmcnt(0)
143; SI-NEXT:    s_mov_b32 s8, s2
144; SI-NEXT:    s_mov_b32 s9, s3
145; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
146; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
147; SI-NEXT:    s_mov_b32 s4, s0
148; SI-NEXT:    s_mov_b32 s5, s1
149; SI-NEXT:    s_waitcnt vmcnt(0)
150; SI-NEXT:    v_mul_lo_u32 v3, v3, v7
151; SI-NEXT:    v_mul_lo_u32 v2, v2, v6
152; SI-NEXT:    v_mul_lo_u32 v1, v1, v5
153; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
154; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
155; SI-NEXT:    s_endpgm
156;
157; VI-LABEL: v_mul_v4i32:
158; VI:       ; %bb.0: ; %entry
159; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
160; VI-NEXT:    s_mov_b32 s7, 0xf000
161; VI-NEXT:    s_mov_b32 s6, -1
162; VI-NEXT:    s_mov_b32 s10, s6
163; VI-NEXT:    s_mov_b32 s11, s7
164; VI-NEXT:    s_waitcnt lgkmcnt(0)
165; VI-NEXT:    s_mov_b32 s8, s2
166; VI-NEXT:    s_mov_b32 s9, s3
167; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
168; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
169; VI-NEXT:    s_mov_b32 s4, s0
170; VI-NEXT:    s_mov_b32 s5, s1
171; VI-NEXT:    s_waitcnt vmcnt(0)
172; VI-NEXT:    v_mul_lo_u32 v3, v3, v7
173; VI-NEXT:    v_mul_lo_u32 v2, v2, v6
174; VI-NEXT:    v_mul_lo_u32 v1, v1, v5
175; VI-NEXT:    v_mul_lo_u32 v0, v0, v4
176; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
177; VI-NEXT:    s_endpgm
178;
179; GFX9-LABEL: v_mul_v4i32:
180; GFX9:       ; %bb.0: ; %entry
181; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
182; GFX9-NEXT:    s_mov_b32 s7, 0xf000
183; GFX9-NEXT:    s_mov_b32 s6, -1
184; GFX9-NEXT:    s_mov_b32 s10, s6
185; GFX9-NEXT:    s_mov_b32 s11, s7
186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
187; GFX9-NEXT:    s_mov_b32 s8, s2
188; GFX9-NEXT:    s_mov_b32 s9, s3
189; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
190; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
191; GFX9-NEXT:    s_mov_b32 s4, s0
192; GFX9-NEXT:    s_mov_b32 s5, s1
193; GFX9-NEXT:    s_waitcnt vmcnt(0)
194; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
195; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
196; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
197; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
198; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
199; GFX9-NEXT:    s_endpgm
200;
201; GFX10-LABEL: v_mul_v4i32:
202; GFX10:       ; %bb.0: ; %entry
203; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
204; GFX10-NEXT:    s_mov_b32 s6, -1
205; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
206; GFX10-NEXT:    s_mov_b32 s10, s6
207; GFX10-NEXT:    s_mov_b32 s11, s7
208; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX10-NEXT:    s_mov_b32 s8, s2
210; GFX10-NEXT:    s_mov_b32 s9, s3
211; GFX10-NEXT:    s_clause 0x1
212; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
213; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
214; GFX10-NEXT:    s_mov_b32 s4, s0
215; GFX10-NEXT:    s_mov_b32 s5, s1
216; GFX10-NEXT:    s_waitcnt vmcnt(0)
217; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
218; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
219; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
220; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
221; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
222; GFX10-NEXT:    s_endpgm
223;
224; GFX11-LABEL: v_mul_v4i32:
225; GFX11:       ; %bb.0: ; %entry
226; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
227; GFX11-NEXT:    s_mov_b32 s6, -1
228; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
229; GFX11-NEXT:    s_mov_b32 s10, s6
230; GFX11-NEXT:    s_mov_b32 s11, s7
231; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX11-NEXT:    s_mov_b32 s8, s2
233; GFX11-NEXT:    s_mov_b32 s9, s3
234; GFX11-NEXT:    s_clause 0x1
235; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
236; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
237; GFX11-NEXT:    s_mov_b32 s4, s0
238; GFX11-NEXT:    s_mov_b32 s5, s1
239; GFX11-NEXT:    s_waitcnt vmcnt(0)
240; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v7
241; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v6
242; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v5
243; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v4
244; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
245; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
246; GFX11-NEXT:    s_endpgm
247;
248; EG-LABEL: v_mul_v4i32:
249; EG:       ; %bb.0: ; %entry
250; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
251; EG-NEXT:    TEX 1 @6
252; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
253; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
254; EG-NEXT:    CF_END
255; EG-NEXT:    PAD
256; EG-NEXT:    Fetch clause starting at 6:
257; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
258; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
259; EG-NEXT:    ALU clause starting at 10:
260; EG-NEXT:     MOV * T0.X, KC0[2].Z,
261; EG-NEXT:    ALU clause starting at 11:
262; EG-NEXT:     MULLO_INT * T0.W, T0.W, T1.W,
263; EG-NEXT:     MULLO_INT * T0.Z, T0.Z, T1.Z,
264; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.Y,
265; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
266; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
267; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
268entry:
269  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
270  %a = load <4 x i32>, ptr addrspace(1) %in
271  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
272  %result = mul <4 x i32> %a, %b
273  store <4 x i32> %result, ptr addrspace(1) %out
274  ret void
275}
276
; Multiplies the two i64 kernel arguments %a and %b and stores only the low
; 32 bits of the product (trunc to i32) to %out. The assertions expect a
; single 32-bit multiply (s_mul_i32 on the amdgcn targets, MULLO_INT on r600)
; and no high-half multiply.
277define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
278; SI-LABEL: s_trunc_i64_mul_to_i32:
279; SI:       ; %bb.0: ; %entry
280; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
281; SI-NEXT:    s_waitcnt lgkmcnt(0)
282; SI-NEXT:    s_load_dword s7, s[0:1], 0xd
283; SI-NEXT:    s_mov_b32 s3, 0xf000
284; SI-NEXT:    s_mov_b32 s2, -1
285; SI-NEXT:    s_mov_b32 s0, s4
286; SI-NEXT:    s_waitcnt lgkmcnt(0)
287; SI-NEXT:    s_mul_i32 s4, s7, s6
288; SI-NEXT:    s_mov_b32 s1, s5
289; SI-NEXT:    v_mov_b32_e32 v0, s4
290; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
291; SI-NEXT:    s_endpgm
292;
293; VI-LABEL: s_trunc_i64_mul_to_i32:
294; VI:       ; %bb.0: ; %entry
295; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
296; VI-NEXT:    s_waitcnt lgkmcnt(0)
297; VI-NEXT:    s_load_dword s7, s[0:1], 0x34
298; VI-NEXT:    s_mov_b32 s3, 0xf000
299; VI-NEXT:    s_mov_b32 s2, -1
300; VI-NEXT:    s_mov_b32 s0, s4
301; VI-NEXT:    s_waitcnt lgkmcnt(0)
302; VI-NEXT:    s_mul_i32 s4, s7, s6
303; VI-NEXT:    s_mov_b32 s1, s5
304; VI-NEXT:    v_mov_b32_e32 v0, s4
305; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
306; VI-NEXT:    s_endpgm
307;
308; GFX9-LABEL: s_trunc_i64_mul_to_i32:
309; GFX9:       ; %bb.0: ; %entry
310; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
311; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x34
313; GFX9-NEXT:    ; kill: killed $sgpr0_sgpr1
314; GFX9-NEXT:    s_mov_b32 s3, 0xf000
315; GFX9-NEXT:    s_mov_b32 s2, -1
316; GFX9-NEXT:    s_mov_b32 s0, s4
317; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX9-NEXT:    s_mul_i32 s4, s7, s6
319; GFX9-NEXT:    s_mov_b32 s1, s5
320; GFX9-NEXT:    v_mov_b32_e32 v0, s4
321; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
322; GFX9-NEXT:    s_endpgm
323;
324; GFX10-LABEL: s_trunc_i64_mul_to_i32:
325; GFX10:       ; %bb.0: ; %entry
326; GFX10-NEXT:    s_clause 0x1
327; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
328; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x34
329; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
331; GFX10-NEXT:    s_mul_i32 s0, s2, s6
332; GFX10-NEXT:    s_mov_b32 s6, -1
333; GFX10-NEXT:    v_mov_b32_e32 v0, s0
334; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
335; GFX10-NEXT:    s_endpgm
336;
337; GFX11-LABEL: s_trunc_i64_mul_to_i32:
338; GFX11:       ; %bb.0: ; %entry
339; GFX11-NEXT:    s_clause 0x1
340; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
341; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
342; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
343; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
344; GFX11-NEXT:    s_mul_i32 s0, s0, s6
345; GFX11-NEXT:    s_mov_b32 s6, -1
346; GFX11-NEXT:    v_mov_b32_e32 v0, s0
347; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
348; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
349; GFX11-NEXT:    s_endpgm
350;
351; EG-LABEL: s_trunc_i64_mul_to_i32:
352; EG:       ; %bb.0: ; %entry
353; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
354; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
355; EG-NEXT:    CF_END
356; EG-NEXT:    PAD
357; EG-NEXT:    ALU clause starting at 4:
358; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
359; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
360; EG-NEXT:     MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
361entry:
362  %mul = mul i64 %b, %a
363  %trunc = trunc i64 %mul to i32
364  store i32 %trunc, ptr addrspace(1) %out, align 8
365  ret void
366}
367
; Truncated i64 multiply with both operands loaded from memory (%aptr and
; %bptr). Only the low 32 bits of the product are stored to %out, and the
; assertions expect just a 32-bit load of each operand followed by a single
; low multiply (v_mul_lo_u32 on the amdgcn targets, MULLO_INT on r600).
368define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
369; SI-LABEL: v_trunc_i64_mul_to_i32:
370; SI:       ; %bb.0: ; %entry
371; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
372; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
373; SI-NEXT:    s_mov_b32 s3, 0xf000
374; SI-NEXT:    s_mov_b32 s2, -1
375; SI-NEXT:    s_mov_b32 s14, s2
376; SI-NEXT:    s_waitcnt lgkmcnt(0)
377; SI-NEXT:    s_mov_b32 s12, s6
378; SI-NEXT:    s_mov_b32 s13, s7
379; SI-NEXT:    s_mov_b32 s15, s3
380; SI-NEXT:    s_mov_b32 s10, s2
381; SI-NEXT:    s_mov_b32 s11, s3
382; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
383; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
384; SI-NEXT:    s_mov_b32 s0, s4
385; SI-NEXT:    s_mov_b32 s1, s5
386; SI-NEXT:    s_waitcnt vmcnt(0)
387; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
388; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
389; SI-NEXT:    s_endpgm
390;
391; VI-LABEL: v_trunc_i64_mul_to_i32:
392; VI:       ; %bb.0: ; %entry
393; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
394; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
395; VI-NEXT:    s_mov_b32 s3, 0xf000
396; VI-NEXT:    s_mov_b32 s2, -1
397; VI-NEXT:    s_mov_b32 s14, s2
398; VI-NEXT:    s_waitcnt lgkmcnt(0)
399; VI-NEXT:    s_mov_b32 s12, s6
400; VI-NEXT:    s_mov_b32 s13, s7
401; VI-NEXT:    s_mov_b32 s15, s3
402; VI-NEXT:    s_mov_b32 s10, s2
403; VI-NEXT:    s_mov_b32 s11, s3
404; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
405; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
406; VI-NEXT:    s_mov_b32 s0, s4
407; VI-NEXT:    s_mov_b32 s1, s5
408; VI-NEXT:    s_waitcnt vmcnt(0)
409; VI-NEXT:    v_mul_lo_u32 v0, v1, v0
410; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
411; VI-NEXT:    s_endpgm
412;
413; GFX9-LABEL: v_trunc_i64_mul_to_i32:
414; GFX9:       ; %bb.0: ; %entry
415; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
416; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
417; GFX9-NEXT:    s_mov_b32 s3, 0xf000
418; GFX9-NEXT:    s_mov_b32 s2, -1
419; GFX9-NEXT:    s_mov_b32 s14, s2
420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX9-NEXT:    s_mov_b32 s12, s6
422; GFX9-NEXT:    s_mov_b32 s13, s7
423; GFX9-NEXT:    s_mov_b32 s15, s3
424; GFX9-NEXT:    s_mov_b32 s10, s2
425; GFX9-NEXT:    s_mov_b32 s11, s3
426; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
427; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
428; GFX9-NEXT:    s_mov_b32 s0, s4
429; GFX9-NEXT:    s_mov_b32 s1, s5
430; GFX9-NEXT:    s_waitcnt vmcnt(0)
431; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
432; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
433; GFX9-NEXT:    s_endpgm
434;
435; GFX10-LABEL: v_trunc_i64_mul_to_i32:
436; GFX10:       ; %bb.0: ; %entry
437; GFX10-NEXT:    s_clause 0x1
438; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
439; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
440; GFX10-NEXT:    s_mov_b32 s2, -1
441; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
442; GFX10-NEXT:    s_mov_b32 s14, s2
443; GFX10-NEXT:    s_mov_b32 s15, s3
444; GFX10-NEXT:    s_mov_b32 s10, s2
445; GFX10-NEXT:    s_mov_b32 s11, s3
446; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX10-NEXT:    s_mov_b32 s12, s6
448; GFX10-NEXT:    s_mov_b32 s13, s7
449; GFX10-NEXT:    buffer_load_dword v0, off, s[12:15], 0
450; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
451; GFX10-NEXT:    s_mov_b32 s0, s4
452; GFX10-NEXT:    s_mov_b32 s1, s5
453; GFX10-NEXT:    s_waitcnt vmcnt(0)
454; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
455; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
456; GFX10-NEXT:    s_endpgm
457;
458; GFX11-LABEL: v_trunc_i64_mul_to_i32:
459; GFX11:       ; %bb.0: ; %entry
460; GFX11-NEXT:    s_clause 0x1
461; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
462; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
463; GFX11-NEXT:    s_mov_b32 s10, -1
464; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
465; GFX11-NEXT:    s_mov_b32 s14, s10
466; GFX11-NEXT:    s_mov_b32 s15, s11
467; GFX11-NEXT:    s_mov_b32 s2, s10
468; GFX11-NEXT:    s_mov_b32 s3, s11
469; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX11-NEXT:    s_mov_b32 s12, s6
471; GFX11-NEXT:    s_mov_b32 s13, s7
472; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
473; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
474; GFX11-NEXT:    s_mov_b32 s8, s4
475; GFX11-NEXT:    s_mov_b32 s9, s5
476; GFX11-NEXT:    s_waitcnt vmcnt(0)
477; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v0
478; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
479; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
480; GFX11-NEXT:    s_endpgm
481;
482; EG-LABEL: v_trunc_i64_mul_to_i32:
483; EG:       ; %bb.0: ; %entry
484; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
485; EG-NEXT:    TEX 1 @6
486; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
487; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
488; EG-NEXT:    CF_END
489; EG-NEXT:    PAD
490; EG-NEXT:    Fetch clause starting at 6:
491; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
492; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
493; EG-NEXT:    ALU clause starting at 10:
494; EG-NEXT:     MOV T0.X, KC0[2].Z,
495; EG-NEXT:     MOV * T1.X, KC0[2].W,
496; EG-NEXT:    ALU clause starting at 12:
497; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
498; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
499; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
500entry:
501  %a = load i64, ptr addrspace(1) %aptr, align 8
502  %b = load i64, ptr addrspace(1) %bptr, align 8
503  %mul = mul i64 %b, %a
504  %trunc = trunc i64 %mul to i32
505  store i32 %trunc, ptr addrspace(1) %out, align 8
506  ret void
507}
508
509; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
510; 32 bits of both arguments are sign bits.
511
; Sign-extends the i32 kernel argument %in to i64, multiplies it by the
; constant 80 (0x50 in the assertions) and stores the i64 product to %out.
; Expected lowering per the assertions: v_mul_hi_i32 plus s_mulk_i32 for the
; verde run, a single v_mad_i64_i32 for the tonga run, s_mul_hi_i32 paired
; with s_mulk_i32 or s_mul_i32 for the gfx900/gfx1010/gfx1100 runs, and
; MULHI_INT plus MULLO_INT for the redwood run.
512define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
513; SI-LABEL: mul64_sext_c:
514; SI:       ; %bb.0: ; %entry
515; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
516; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
517; SI-NEXT:    v_mov_b32_e32 v0, 0x50
518; SI-NEXT:    s_mov_b32 s3, 0xf000
519; SI-NEXT:    s_mov_b32 s2, -1
520; SI-NEXT:    s_waitcnt lgkmcnt(0)
521; SI-NEXT:    v_mul_hi_i32 v1, s4, v0
522; SI-NEXT:    s_mulk_i32 s4, 0x50
523; SI-NEXT:    v_mov_b32_e32 v0, s4
524; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
525; SI-NEXT:    s_endpgm
526;
527; VI-LABEL: mul64_sext_c:
528; VI:       ; %bb.0: ; %entry
529; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
530; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
531; VI-NEXT:    v_mov_b32_e32 v0, 0x50
532; VI-NEXT:    s_waitcnt lgkmcnt(0)
533; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
534; VI-NEXT:    s_mov_b32 s3, 0xf000
535; VI-NEXT:    s_mov_b32 s2, -1
536; VI-NEXT:    s_nop 2
537; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
538; VI-NEXT:    s_endpgm
539;
540; GFX9-LABEL: mul64_sext_c:
541; GFX9:       ; %bb.0: ; %entry
542; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
543; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
544; GFX9-NEXT:    s_mov_b32 s7, 0xf000
545; GFX9-NEXT:    s_mov_b32 s6, -1
546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
547; GFX9-NEXT:    s_mul_hi_i32 s0, s2, 0x50
548; GFX9-NEXT:    s_mulk_i32 s2, 0x50
549; GFX9-NEXT:    v_mov_b32_e32 v0, s2
550; GFX9-NEXT:    v_mov_b32_e32 v1, s0
551; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
552; GFX9-NEXT:    s_endpgm
553;
554; GFX10-LABEL: mul64_sext_c:
555; GFX10:       ; %bb.0: ; %entry
556; GFX10-NEXT:    s_clause 0x1
557; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
558; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
559; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
560; GFX10-NEXT:    s_mov_b32 s6, -1
561; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX10-NEXT:    s_mul_i32 s0, s2, 0x50
563; GFX10-NEXT:    s_mul_hi_i32 s1, s2, 0x50
564; GFX10-NEXT:    v_mov_b32_e32 v0, s0
565; GFX10-NEXT:    v_mov_b32_e32 v1, s1
566; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
567; GFX10-NEXT:    s_endpgm
568;
569; GFX11-LABEL: mul64_sext_c:
570; GFX11:       ; %bb.0: ; %entry
571; GFX11-NEXT:    s_clause 0x1
572; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
573; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
574; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
576; GFX11-NEXT:    s_mul_hi_i32 s2, s2, 0x50
577; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
578; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
579; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
580; GFX11-NEXT:    s_mov_b32 s2, -1
581; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
582; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
583; GFX11-NEXT:    s_endpgm
584;
585; EG-LABEL: mul64_sext_c:
586; EG:       ; %bb.0: ; %entry
587; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
588; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
589; EG-NEXT:    CF_END
590; EG-NEXT:    PAD
591; EG-NEXT:    ALU clause starting at 4:
592; EG-NEXT:     MULHI_INT * T0.Y, KC0[2].Z, literal.x,
593; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
594; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
595; EG-NEXT:     MULLO_INT * T0.X, KC0[2].Z, literal.y,
596; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
597entry:
598  %0 = sext i32 %in to i64
599  %1 = mul i64 %0, 80
600  store i64 %1, ptr addrspace(1) %out
601  ret void
602}
603
; Loads an i32 from %in, sign-extends it to i64, multiplies by the constant
; 80 (0x50 in the assertions) and stores the i64 product to %out. Expected
; lowering per the assertions: v_mul_hi_i32 plus v_mul_lo_u32 for the verde,
; gfx900, gfx1010 and gfx1100 runs, a single v_mad_i64_i32 for the tonga run,
; and MULHI_INT plus MULLO_INT for the redwood run.
604define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
605; SI-LABEL: v_mul64_sext_c:
606; SI:       ; %bb.0: ; %entry
607; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
608; SI-NEXT:    s_mov_b32 s7, 0xf000
609; SI-NEXT:    s_mov_b32 s6, -1
610; SI-NEXT:    s_mov_b32 s10, s6
611; SI-NEXT:    s_mov_b32 s11, s7
612; SI-NEXT:    s_waitcnt lgkmcnt(0)
613; SI-NEXT:    s_mov_b32 s8, s2
614; SI-NEXT:    s_mov_b32 s9, s3
615; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
616; SI-NEXT:    s_movk_i32 s2, 0x50
617; SI-NEXT:    s_mov_b32 s4, s0
618; SI-NEXT:    s_mov_b32 s5, s1
619; SI-NEXT:    s_waitcnt vmcnt(0)
620; SI-NEXT:    v_mul_hi_i32 v1, v0, s2
621; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
622; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
623; SI-NEXT:    s_endpgm
624;
625; VI-LABEL: v_mul64_sext_c:
626; VI:       ; %bb.0: ; %entry
627; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
628; VI-NEXT:    s_mov_b32 s7, 0xf000
629; VI-NEXT:    s_mov_b32 s6, -1
630; VI-NEXT:    s_mov_b32 s10, s6
631; VI-NEXT:    s_mov_b32 s11, s7
632; VI-NEXT:    s_waitcnt lgkmcnt(0)
633; VI-NEXT:    s_mov_b32 s8, s2
634; VI-NEXT:    s_mov_b32 s9, s3
635; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
636; VI-NEXT:    s_movk_i32 s2, 0x50
637; VI-NEXT:    s_mov_b32 s4, s0
638; VI-NEXT:    s_mov_b32 s5, s1
639; VI-NEXT:    s_waitcnt vmcnt(0)
640; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
641; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
642; VI-NEXT:    s_endpgm
643;
644; GFX9-LABEL: v_mul64_sext_c:
645; GFX9:       ; %bb.0: ; %entry
646; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
647; GFX9-NEXT:    s_mov_b32 s7, 0xf000
648; GFX9-NEXT:    s_mov_b32 s6, -1
649; GFX9-NEXT:    s_mov_b32 s10, s6
650; GFX9-NEXT:    s_mov_b32 s11, s7
651; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX9-NEXT:    s_mov_b32 s8, s2
653; GFX9-NEXT:    s_mov_b32 s9, s3
654; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
655; GFX9-NEXT:    s_movk_i32 s2, 0x50
656; GFX9-NEXT:    s_mov_b32 s4, s0
657; GFX9-NEXT:    s_mov_b32 s5, s1
658; GFX9-NEXT:    s_waitcnt vmcnt(0)
659; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
660; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
661; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
662; GFX9-NEXT:    s_endpgm
663;
664; GFX10-LABEL: v_mul64_sext_c:
665; GFX10:       ; %bb.0: ; %entry
666; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
667; GFX10-NEXT:    s_mov_b32 s6, -1
668; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
669; GFX10-NEXT:    s_mov_b32 s10, s6
670; GFX10-NEXT:    s_mov_b32 s11, s7
671; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX10-NEXT:    s_mov_b32 s8, s2
673; GFX10-NEXT:    s_mov_b32 s9, s3
674; GFX10-NEXT:    s_mov_b32 s4, s0
675; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
676; GFX10-NEXT:    s_mov_b32 s5, s1
677; GFX10-NEXT:    s_waitcnt vmcnt(0)
678; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
679; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
680; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
681; GFX10-NEXT:    s_endpgm
682;
683; GFX11-LABEL: v_mul64_sext_c:
684; GFX11:       ; %bb.0: ; %entry
685; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
686; GFX11-NEXT:    s_mov_b32 s6, -1
687; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
688; GFX11-NEXT:    s_mov_b32 s10, s6
689; GFX11-NEXT:    s_mov_b32 s11, s7
690; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX11-NEXT:    s_mov_b32 s8, s2
692; GFX11-NEXT:    s_mov_b32 s9, s3
693; GFX11-NEXT:    s_mov_b32 s4, s0
694; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
695; GFX11-NEXT:    s_mov_b32 s5, s1
696; GFX11-NEXT:    s_waitcnt vmcnt(0)
697; GFX11-NEXT:    v_mul_hi_i32 v1, 0x50, v0
698; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
699; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
700; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
701; GFX11-NEXT:    s_endpgm
702;
703; EG-LABEL: v_mul64_sext_c:
704; EG:       ; %bb.0: ; %entry
705; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
706; EG-NEXT:    TEX 0 @6
707; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
708; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
709; EG-NEXT:    CF_END
710; EG-NEXT:    PAD
711; EG-NEXT:    Fetch clause starting at 6:
712; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
713; EG-NEXT:    ALU clause starting at 8:
714; EG-NEXT:     MOV * T0.X, KC0[2].Z,
715; EG-NEXT:    ALU clause starting at 9:
716; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
717; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
718; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
719; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
720; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
721entry:
722  %val = load i32, ptr addrspace(1) %in, align 4
723  %ext = sext i32 %val to i64
724  %mul = mul i64 %ext, 80
725  store i64 %mul, ptr addrspace(1) %out, align 8
726  ret void
727}
728
729define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
730; SI-LABEL: v_mul64_sext_inline_imm:
731; SI:       ; %bb.0: ; %entry
732; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
733; SI-NEXT:    s_mov_b32 s7, 0xf000
734; SI-NEXT:    s_mov_b32 s6, -1
735; SI-NEXT:    s_mov_b32 s10, s6
736; SI-NEXT:    s_mov_b32 s11, s7
737; SI-NEXT:    s_waitcnt lgkmcnt(0)
738; SI-NEXT:    s_mov_b32 s8, s2
739; SI-NEXT:    s_mov_b32 s9, s3
740; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
741; SI-NEXT:    s_mov_b32 s4, s0
742; SI-NEXT:    s_mov_b32 s5, s1
743; SI-NEXT:    s_waitcnt vmcnt(0)
744; SI-NEXT:    v_mul_hi_i32 v1, v0, 9
745; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
746; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
747; SI-NEXT:    s_endpgm
748;
749; VI-LABEL: v_mul64_sext_inline_imm:
750; VI:       ; %bb.0: ; %entry
751; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
752; VI-NEXT:    s_mov_b32 s7, 0xf000
753; VI-NEXT:    s_mov_b32 s6, -1
754; VI-NEXT:    s_mov_b32 s10, s6
755; VI-NEXT:    s_mov_b32 s11, s7
756; VI-NEXT:    s_waitcnt lgkmcnt(0)
757; VI-NEXT:    s_mov_b32 s8, s2
758; VI-NEXT:    s_mov_b32 s9, s3
759; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
760; VI-NEXT:    s_mov_b32 s4, s0
761; VI-NEXT:    s_mov_b32 s5, s1
762; VI-NEXT:    s_waitcnt vmcnt(0)
763; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
764; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
765; VI-NEXT:    s_endpgm
766;
767; GFX9-LABEL: v_mul64_sext_inline_imm:
768; GFX9:       ; %bb.0: ; %entry
769; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
770; GFX9-NEXT:    s_mov_b32 s7, 0xf000
771; GFX9-NEXT:    s_mov_b32 s6, -1
772; GFX9-NEXT:    s_mov_b32 s10, s6
773; GFX9-NEXT:    s_mov_b32 s11, s7
774; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
775; GFX9-NEXT:    s_mov_b32 s8, s2
776; GFX9-NEXT:    s_mov_b32 s9, s3
777; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
778; GFX9-NEXT:    s_mov_b32 s4, s0
779; GFX9-NEXT:    s_mov_b32 s5, s1
780; GFX9-NEXT:    s_waitcnt vmcnt(0)
781; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
782; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
783; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
784; GFX9-NEXT:    s_endpgm
785;
786; GFX10-LABEL: v_mul64_sext_inline_imm:
787; GFX10:       ; %bb.0: ; %entry
788; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
789; GFX10-NEXT:    s_mov_b32 s6, -1
790; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
791; GFX10-NEXT:    s_mov_b32 s10, s6
792; GFX10-NEXT:    s_mov_b32 s11, s7
793; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX10-NEXT:    s_mov_b32 s8, s2
795; GFX10-NEXT:    s_mov_b32 s9, s3
796; GFX10-NEXT:    s_mov_b32 s4, s0
797; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
798; GFX10-NEXT:    s_mov_b32 s5, s1
799; GFX10-NEXT:    s_waitcnt vmcnt(0)
800; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
801; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
802; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
803; GFX10-NEXT:    s_endpgm
804;
805; GFX11-LABEL: v_mul64_sext_inline_imm:
806; GFX11:       ; %bb.0: ; %entry
807; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
808; GFX11-NEXT:    s_mov_b32 s6, -1
809; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
810; GFX11-NEXT:    s_mov_b32 s10, s6
811; GFX11-NEXT:    s_mov_b32 s11, s7
812; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
813; GFX11-NEXT:    s_mov_b32 s8, s2
814; GFX11-NEXT:    s_mov_b32 s9, s3
815; GFX11-NEXT:    s_mov_b32 s4, s0
816; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
817; GFX11-NEXT:    s_mov_b32 s5, s1
818; GFX11-NEXT:    s_waitcnt vmcnt(0)
819; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
820; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
821; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
822; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
823; GFX11-NEXT:    s_endpgm
824;
825; EG-LABEL: v_mul64_sext_inline_imm:
826; EG:       ; %bb.0: ; %entry
827; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
828; EG-NEXT:    TEX 0 @6
829; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
830; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
831; EG-NEXT:    CF_END
832; EG-NEXT:    PAD
833; EG-NEXT:    Fetch clause starting at 6:
834; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
835; EG-NEXT:    ALU clause starting at 8:
836; EG-NEXT:     MOV * T0.X, KC0[2].Z,
837; EG-NEXT:    ALU clause starting at 9:
838; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
839; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
840; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
841; EG-NEXT:     MULLO_INT * T0.X, T0.X, literal.y,
842; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
843entry:
844  %val = load i32, ptr addrspace(1) %in, align 4
845  %ext = sext i32 %val to i64
846  %mul = mul i64 %ext, 9
847  store i64 %mul, ptr addrspace(1) %out, align 8
848  ret void
849}
850
; s_mul_i32 multiplies the two i32 kernel arguments %a and %b and stores the
; 32-bit product to %out. The two unnamed [8 x i32] arguments are never used in
; the body (presumably they only space %a and %b apart in the kernel argument
; list -- TODO confirm). The autogenerated assertions below expect a scalar
; s_mul_i32 on the amdgcn targets and a MULLO_INT on r600/redwood.
851define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
852; SI-LABEL: s_mul_i32:
853; SI:       ; %bb.0: ; %entry
854; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
855; SI-NEXT:    s_load_dword s5, s[0:1], 0x1c
856; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
857; SI-NEXT:    s_mov_b32 s3, 0xf000
858; SI-NEXT:    s_mov_b32 s2, -1
859; SI-NEXT:    s_waitcnt lgkmcnt(0)
860; SI-NEXT:    s_mul_i32 s4, s4, s5
861; SI-NEXT:    v_mov_b32_e32 v0, s4
862; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
863; SI-NEXT:    s_endpgm
864;
865; VI-LABEL: s_mul_i32:
866; VI:       ; %bb.0: ; %entry
867; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
868; VI-NEXT:    s_load_dword s5, s[0:1], 0x70
869; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
870; VI-NEXT:    s_mov_b32 s3, 0xf000
871; VI-NEXT:    s_mov_b32 s2, -1
872; VI-NEXT:    s_waitcnt lgkmcnt(0)
873; VI-NEXT:    s_mul_i32 s4, s4, s5
874; VI-NEXT:    v_mov_b32_e32 v0, s4
875; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
876; VI-NEXT:    s_endpgm
877;
878; GFX9-LABEL: s_mul_i32:
879; GFX9:       ; %bb.0: ; %entry
880; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x4c
881; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x70
882; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
883; GFX9-NEXT:    s_mov_b32 s7, 0xf000
884; GFX9-NEXT:    s_mov_b32 s6, -1
885; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX9-NEXT:    s_mul_i32 s0, s2, s3
887; GFX9-NEXT:    v_mov_b32_e32 v0, s0
888; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
889; GFX9-NEXT:    s_endpgm
890;
891; GFX10-LABEL: s_mul_i32:
892; GFX10:       ; %bb.0: ; %entry
893; GFX10-NEXT:    s_clause 0x2
894; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
895; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
896; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
897; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
898; GFX10-NEXT:    s_mov_b32 s6, -1
899; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX10-NEXT:    s_mul_i32 s0, s2, s3
901; GFX10-NEXT:    v_mov_b32_e32 v0, s0
902; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
903; GFX10-NEXT:    s_endpgm
904;
905; GFX11-LABEL: s_mul_i32:
906; GFX11:       ; %bb.0: ; %entry
907; GFX11-NEXT:    s_clause 0x2
908; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x4c
909; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x70
910; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
911; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX11-NEXT:    s_mul_i32 s2, s2, s3
913; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
914; GFX11-NEXT:    v_mov_b32_e32 v0, s2
915; GFX11-NEXT:    s_mov_b32 s2, -1
916; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
917; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
918; GFX11-NEXT:    s_endpgm
919;
920; EG-LABEL: s_mul_i32:
921; EG:       ; %bb.0: ; %entry
922; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
923; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
924; EG-NEXT:    CF_END
925; EG-NEXT:    PAD
926; EG-NEXT:    ALU clause starting at 4:
927; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
928; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
929; EG-NEXT:     MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
930entry:
931  %mul = mul i32 %a, %b ; both operands come straight from kernel arguments
932  store i32 %mul, ptr addrspace(1) %out, align 4
933  ret void
934}
935
; v_mul_i32 loads two adjacent i32 values from %in (elements 0 and 1),
; multiplies them and stores the 32-bit product to %out. The autogenerated
; assertions below expect both operands to arrive through one 64-bit load,
; followed by v_mul_lo_u32 on the amdgcn targets and MULLO_INT on r600/redwood.
936define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
937; SI-LABEL: v_mul_i32:
938; SI:       ; %bb.0: ; %entry
939; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
940; SI-NEXT:    s_mov_b32 s7, 0xf000
941; SI-NEXT:    s_mov_b32 s6, -1
942; SI-NEXT:    s_mov_b32 s10, s6
943; SI-NEXT:    s_mov_b32 s11, s7
944; SI-NEXT:    s_waitcnt lgkmcnt(0)
945; SI-NEXT:    s_mov_b32 s8, s2
946; SI-NEXT:    s_mov_b32 s9, s3
947; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
948; SI-NEXT:    s_mov_b32 s4, s0
949; SI-NEXT:    s_mov_b32 s5, s1
950; SI-NEXT:    s_waitcnt vmcnt(0)
951; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
952; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
953; SI-NEXT:    s_endpgm
954;
955; VI-LABEL: v_mul_i32:
956; VI:       ; %bb.0: ; %entry
957; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
958; VI-NEXT:    s_mov_b32 s7, 0xf000
959; VI-NEXT:    s_mov_b32 s6, -1
960; VI-NEXT:    s_mov_b32 s10, s6
961; VI-NEXT:    s_mov_b32 s11, s7
962; VI-NEXT:    s_waitcnt lgkmcnt(0)
963; VI-NEXT:    s_mov_b32 s8, s2
964; VI-NEXT:    s_mov_b32 s9, s3
965; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
966; VI-NEXT:    s_mov_b32 s4, s0
967; VI-NEXT:    s_mov_b32 s5, s1
968; VI-NEXT:    s_waitcnt vmcnt(0)
969; VI-NEXT:    v_mul_lo_u32 v0, v0, v1
970; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
971; VI-NEXT:    s_endpgm
972;
973; GFX9-LABEL: v_mul_i32:
974; GFX9:       ; %bb.0: ; %entry
975; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
976; GFX9-NEXT:    s_mov_b32 s7, 0xf000
977; GFX9-NEXT:    s_mov_b32 s6, -1
978; GFX9-NEXT:    s_mov_b32 s10, s6
979; GFX9-NEXT:    s_mov_b32 s11, s7
980; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
981; GFX9-NEXT:    s_mov_b32 s8, s2
982; GFX9-NEXT:    s_mov_b32 s9, s3
983; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
984; GFX9-NEXT:    s_mov_b32 s4, s0
985; GFX9-NEXT:    s_mov_b32 s5, s1
986; GFX9-NEXT:    s_waitcnt vmcnt(0)
987; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
988; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
989; GFX9-NEXT:    s_endpgm
990;
991; GFX10-LABEL: v_mul_i32:
992; GFX10:       ; %bb.0: ; %entry
993; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
994; GFX10-NEXT:    s_mov_b32 s6, -1
995; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
996; GFX10-NEXT:    s_mov_b32 s10, s6
997; GFX10-NEXT:    s_mov_b32 s11, s7
998; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX10-NEXT:    s_mov_b32 s8, s2
1000; GFX10-NEXT:    s_mov_b32 s9, s3
1001; GFX10-NEXT:    s_mov_b32 s4, s0
1002; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1003; GFX10-NEXT:    s_mov_b32 s5, s1
1004; GFX10-NEXT:    s_waitcnt vmcnt(0)
1005; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
1006; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1007; GFX10-NEXT:    s_endpgm
1008;
1009; GFX11-LABEL: v_mul_i32:
1010; GFX11:       ; %bb.0: ; %entry
1011; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1012; GFX11-NEXT:    s_mov_b32 s6, -1
1013; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1014; GFX11-NEXT:    s_mov_b32 s10, s6
1015; GFX11-NEXT:    s_mov_b32 s11, s7
1016; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX11-NEXT:    s_mov_b32 s8, s2
1018; GFX11-NEXT:    s_mov_b32 s9, s3
1019; GFX11-NEXT:    s_mov_b32 s4, s0
1020; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
1021; GFX11-NEXT:    s_mov_b32 s5, s1
1022; GFX11-NEXT:    s_waitcnt vmcnt(0)
1023; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
1024; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
1025; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1026; GFX11-NEXT:    s_endpgm
1027;
1028; EG-LABEL: v_mul_i32:
1029; EG:       ; %bb.0: ; %entry
1030; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1031; EG-NEXT:    TEX 0 @6
1032; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1033; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1034; EG-NEXT:    CF_END
1035; EG-NEXT:    PAD
1036; EG-NEXT:    Fetch clause starting at 6:
1037; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1038; EG-NEXT:    ALU clause starting at 8:
1039; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1040; EG-NEXT:    ALU clause starting at 9:
1041; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1042; EG-NEXT:     MULLO_INT * T0.X, T0.X, T0.Y,
1043; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1044entry:
1045  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 ; address of the second i32 at %in
1046  %a = load i32, ptr addrspace(1) %in
1047  %b = load i32, ptr addrspace(1) %b_ptr
1048  %result = mul i32 %a, %b
1049  store i32 %result, ptr addrspace(1) %out
1050  ret void
1051}
1052
1053; A standard 64-bit multiply.  The expansion should be around 6 instructions.
1054; The exact expansion is now pinned by the autogenerated assertions below
1055; rather than by hand-written FileCheck expressions.  If a correct
1056; optimization changes the expected output, regenerate the assertions with
1057; utils/update_llc_test_checks.py instead of editing them by hand; at a
1058; minimum the compiler must not crash with a 'failed to select' error.
1059
; s_mul_i64 multiplies the two i64 kernel arguments %a and %b and stores the
; 64-bit product to %out. The autogenerated assertions below pin the expansion
; into 32-bit pieces: a low multiply, a high multiply (a mul_hi / MULHI
; instruction, or v_mad_u64_u32 on tonga which yields both halves) and two
; cross-term multiplies that are added into the upper half of the result.
1060define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
1061; SI-LABEL: s_mul_i64:
1062; SI:       ; %bb.0: ; %entry
1063; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1064; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1065; SI-NEXT:    s_mov_b32 s3, 0xf000
1066; SI-NEXT:    s_mov_b32 s2, -1
1067; SI-NEXT:    s_waitcnt lgkmcnt(0)
1068; SI-NEXT:    s_mov_b32 s0, s4
1069; SI-NEXT:    v_mov_b32_e32 v0, s8
1070; SI-NEXT:    v_mul_hi_u32 v0, s6, v0
1071; SI-NEXT:    s_mul_i32 s4, s6, s9
1072; SI-NEXT:    s_mov_b32 s1, s5
1073; SI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
1074; SI-NEXT:    s_mul_i32 s4, s7, s8
1075; SI-NEXT:    v_add_i32_e32 v1, vcc, s4, v0
1076; SI-NEXT:    s_mul_i32 s4, s6, s8
1077; SI-NEXT:    v_mov_b32_e32 v0, s4
1078; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1079; SI-NEXT:    s_endpgm
1080;
1081; VI-LABEL: s_mul_i64:
1082; VI:       ; %bb.0: ; %entry
1083; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1084; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1085; VI-NEXT:    s_mov_b32 s3, 0xf000
1086; VI-NEXT:    s_mov_b32 s2, -1
1087; VI-NEXT:    s_waitcnt lgkmcnt(0)
1088; VI-NEXT:    s_mov_b32 s0, s4
1089; VI-NEXT:    v_mov_b32_e32 v0, s8
1090; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0
1091; VI-NEXT:    s_mul_i32 s4, s6, s9
1092; VI-NEXT:    s_mov_b32 s1, s5
1093; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1094; VI-NEXT:    s_mul_i32 s4, s7, s8
1095; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1096; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1097; VI-NEXT:    s_endpgm
1098;
1099; GFX9-LABEL: s_mul_i64:
1100; GFX9:       ; %bb.0: ; %entry
1101; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1102; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1103; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1104; GFX9-NEXT:    s_mov_b32 s2, -1
1105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX9-NEXT:    s_mov_b32 s0, s4
1107; GFX9-NEXT:    s_mov_b32 s1, s5
1108; GFX9-NEXT:    s_mul_i32 s4, s6, s9
1109; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s8
1110; GFX9-NEXT:    s_add_i32 s4, s5, s4
1111; GFX9-NEXT:    s_mul_i32 s5, s7, s8
1112; GFX9-NEXT:    s_add_i32 s4, s4, s5
1113; GFX9-NEXT:    s_mul_i32 s5, s6, s8
1114; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1115; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1116; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1117; GFX9-NEXT:    s_endpgm
1118;
1119; GFX10-LABEL: s_mul_i64:
1120; GFX10:       ; %bb.0: ; %entry
1121; GFX10-NEXT:    s_clause 0x1
1122; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1123; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1124; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX10-NEXT:    s_mul_i32 s0, s6, s3
1126; GFX10-NEXT:    s_mul_hi_u32 s1, s6, s2
1127; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1128; GFX10-NEXT:    s_add_i32 s0, s1, s0
1129; GFX10-NEXT:    s_mul_i32 s1, s7, s2
1130; GFX10-NEXT:    s_mul_i32 s2, s6, s2
1131; GFX10-NEXT:    s_add_i32 s0, s0, s1
1132; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1133; GFX10-NEXT:    v_mov_b32_e32 v1, s0
1134; GFX10-NEXT:    s_mov_b32 s2, -1
1135; GFX10-NEXT:    s_mov_b32 s0, s4
1136; GFX10-NEXT:    s_mov_b32 s1, s5
1137; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1138; GFX10-NEXT:    s_endpgm
1139;
1140; GFX11-LABEL: s_mul_i64:
1141; GFX11:       ; %bb.0: ; %entry
1142; GFX11-NEXT:    s_clause 0x1
1143; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1144; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1145; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1146; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1147; GFX11-NEXT:    s_mul_i32 s1, s6, s1
1148; GFX11-NEXT:    s_mul_hi_u32 s2, s6, s0
1149; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1150; GFX11-NEXT:    s_add_i32 s1, s2, s1
1151; GFX11-NEXT:    s_mul_i32 s2, s7, s0
1152; GFX11-NEXT:    s_mul_i32 s0, s6, s0
1153; GFX11-NEXT:    s_add_i32 s1, s1, s2
1154; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1155; GFX11-NEXT:    s_mov_b32 s2, -1
1156; GFX11-NEXT:    s_mov_b32 s0, s4
1157; GFX11-NEXT:    s_mov_b32 s1, s5
1158; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1159; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1160; GFX11-NEXT:    s_endpgm
1161;
1162; EG-LABEL: s_mul_i64:
1163; EG:       ; %bb.0: ; %entry
1164; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1165; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1166; EG-NEXT:    CF_END
1167; EG-NEXT:    PAD
1168; EG-NEXT:    ALU clause starting at 4:
1169; EG-NEXT:     MULHI * T0.X, KC0[2].W, KC0[3].Y,
1170; EG-NEXT:     MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z,
1171; EG-NEXT:     ADD_INT T0.W, T0.X, PS,
1172; EG-NEXT:     MULLO_INT * T0.X, KC0[3].X, KC0[3].Y,
1173; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
1174; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1175; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1176; EG-NEXT:     MULLO_INT * T0.X, KC0[2].W, KC0[3].Y,
1177entry:
1178  %mul = mul i64 %a, %b ; both operands come straight from kernel arguments
1179  store i64 %mul, ptr addrspace(1) %out, align 8
1180  ret void
1181}
1182
; v_mul_i64 loads one i64 from %aptr and one from %bptr, multiplies them and
; stores the 64-bit product to %out. The autogenerated assertions below pin the
; expansion into 32-bit vector operations on the amdgcn targets (v_mul_lo_u32
; together with v_mul_hi_u32, or v_mad_u64_u32 on tonga, plus two adds into
; the upper half) and into MULHI / MULLO_INT / ADD_INT on r600/redwood.
1183define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
1184; SI-LABEL: v_mul_i64:
1185; SI:       ; %bb.0: ; %entry
1186; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1187; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1188; SI-NEXT:    s_mov_b32 s3, 0xf000
1189; SI-NEXT:    s_mov_b32 s2, -1
1190; SI-NEXT:    s_mov_b32 s10, s2
1191; SI-NEXT:    s_mov_b32 s11, s3
1192; SI-NEXT:    s_waitcnt lgkmcnt(0)
1193; SI-NEXT:    s_mov_b32 s12, s6
1194; SI-NEXT:    s_mov_b32 s13, s7
1195; SI-NEXT:    s_mov_b32 s14, s2
1196; SI-NEXT:    s_mov_b32 s15, s3
1197; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1198; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1199; SI-NEXT:    s_mov_b32 s0, s4
1200; SI-NEXT:    s_mov_b32 s1, s5
1201; SI-NEXT:    s_waitcnt vmcnt(0)
1202; SI-NEXT:    v_mul_lo_u32 v1, v2, v1
1203; SI-NEXT:    v_mul_hi_u32 v4, v2, v0
1204; SI-NEXT:    v_mul_lo_u32 v3, v3, v0
1205; SI-NEXT:    v_mul_lo_u32 v0, v2, v0
1206; SI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1207; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1208; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1209; SI-NEXT:    s_endpgm
1210;
1211; VI-LABEL: v_mul_i64:
1212; VI:       ; %bb.0: ; %entry
1213; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1214; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1215; VI-NEXT:    s_mov_b32 s3, 0xf000
1216; VI-NEXT:    s_mov_b32 s2, -1
1217; VI-NEXT:    s_mov_b32 s10, s2
1218; VI-NEXT:    s_mov_b32 s11, s3
1219; VI-NEXT:    s_waitcnt lgkmcnt(0)
1220; VI-NEXT:    s_mov_b32 s12, s6
1221; VI-NEXT:    s_mov_b32 s13, s7
1222; VI-NEXT:    s_mov_b32 s14, s2
1223; VI-NEXT:    s_mov_b32 s15, s3
1224; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1225; VI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1226; VI-NEXT:    s_mov_b32 s0, s4
1227; VI-NEXT:    s_mov_b32 s1, s5
1228; VI-NEXT:    s_waitcnt vmcnt(0)
1229; VI-NEXT:    v_mul_lo_u32 v4, v2, v1
1230; VI-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0
1231; VI-NEXT:    v_mul_lo_u32 v0, v3, v0
1232; VI-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
1233; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v0
1234; VI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
1235; VI-NEXT:    s_endpgm
1236;
1237; GFX9-LABEL: v_mul_i64:
1238; GFX9:       ; %bb.0: ; %entry
1239; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1240; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1241; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1242; GFX9-NEXT:    s_mov_b32 s2, -1
1243; GFX9-NEXT:    s_mov_b32 s10, s2
1244; GFX9-NEXT:    s_mov_b32 s11, s3
1245; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX9-NEXT:    s_mov_b32 s12, s6
1247; GFX9-NEXT:    s_mov_b32 s13, s7
1248; GFX9-NEXT:    s_mov_b32 s14, s2
1249; GFX9-NEXT:    s_mov_b32 s15, s3
1250; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1251; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1252; GFX9-NEXT:    s_mov_b32 s0, s4
1253; GFX9-NEXT:    s_mov_b32 s1, s5
1254; GFX9-NEXT:    s_waitcnt vmcnt(0)
1255; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
1256; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v0
1257; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
1258; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
1259; GFX9-NEXT:    v_add_u32_e32 v1, v4, v1
1260; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1261; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1262; GFX9-NEXT:    s_endpgm
1263;
1264; GFX10-LABEL: v_mul_i64:
1265; GFX10:       ; %bb.0: ; %entry
1266; GFX10-NEXT:    s_clause 0x1
1267; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1268; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
1269; GFX10-NEXT:    s_mov_b32 s2, -1
1270; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1271; GFX10-NEXT:    s_mov_b32 s10, s2
1272; GFX10-NEXT:    s_mov_b32 s11, s3
1273; GFX10-NEXT:    s_mov_b32 s14, s2
1274; GFX10-NEXT:    s_mov_b32 s15, s3
1275; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1276; GFX10-NEXT:    s_mov_b32 s12, s6
1277; GFX10-NEXT:    s_mov_b32 s13, s7
1278; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1279; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
1280; GFX10-NEXT:    s_mov_b32 s0, s4
1281; GFX10-NEXT:    s_mov_b32 s1, s5
1282; GFX10-NEXT:    s_waitcnt vmcnt(0)
1283; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
1284; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v0
1285; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
1286; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
1287; GFX10-NEXT:    v_add_nc_u32_e32 v1, v4, v1
1288; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
1289; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1290; GFX10-NEXT:    s_endpgm
1291;
1292; GFX11-LABEL: v_mul_i64:
1293; GFX11:       ; %bb.0: ; %entry
1294; GFX11-NEXT:    s_clause 0x1
1295; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1296; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1297; GFX11-NEXT:    s_mov_b32 s10, -1
1298; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
1299; GFX11-NEXT:    s_mov_b32 s2, s10
1300; GFX11-NEXT:    s_mov_b32 s3, s11
1301; GFX11-NEXT:    s_mov_b32 s14, s10
1302; GFX11-NEXT:    s_mov_b32 s15, s11
1303; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX11-NEXT:    s_mov_b32 s12, s6
1305; GFX11-NEXT:    s_mov_b32 s13, s7
1306; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
1307; GFX11-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], 0
1308; GFX11-NEXT:    s_mov_b32 s8, s4
1309; GFX11-NEXT:    s_mov_b32 s9, s5
1310; GFX11-NEXT:    s_waitcnt vmcnt(0)
1311; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v1
1312; GFX11-NEXT:    v_mul_hi_u32 v4, v2, v0
1313; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v0
1314; GFX11-NEXT:    v_mul_lo_u32 v0, v2, v0
1315; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1316; GFX11-NEXT:    v_add_nc_u32_e32 v1, v4, v1
1317; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v3
1318; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
1319; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1320; GFX11-NEXT:    s_endpgm
1321;
1322; EG-LABEL: v_mul_i64:
1323; EG:       ; %bb.0: ; %entry
1324; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
1325; EG-NEXT:    TEX 1 @6
1326; EG-NEXT:    ALU 7, @12, KC0[CB0:0-32], KC1[]
1327; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1
1328; EG-NEXT:    CF_END
1329; EG-NEXT:    PAD
1330; EG-NEXT:    Fetch clause starting at 6:
1331; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
1332; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1333; EG-NEXT:    ALU clause starting at 10:
1334; EG-NEXT:     MOV T0.X, KC0[2].Z,
1335; EG-NEXT:     MOV * T1.X, KC0[2].W,
1336; EG-NEXT:    ALU clause starting at 12:
1337; EG-NEXT:     MULHI * T0.Z, T0.X, T1.X,
1338; EG-NEXT:     MULLO_INT * T0.W, T0.X, T1.Y,
1339; EG-NEXT:     ADD_INT T0.W, T0.Z, PS,
1340; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T1.X,
1341; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
1342; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
1343; EG-NEXT:     MULLO_INT * T0.X, T0.X, T1.X,
1344; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1345entry:
1346  %a = load i64, ptr addrspace(1) %aptr, align 8
1347  %b = load i64, ptr addrspace(1) %bptr, align 8
1348  %mul = mul i64 %a, %b ; both operands come from memory loads, not kernel arguments
1349  store i64 %mul, ptr addrspace(1) %out, align 8
1350  ret void
1351}
1352
; mul32_in_branch places a 32-bit multiply in only one arm of a branch. When
; %a is zero the result is loaded from %in; otherwise it is %a * %b. The two
; arms merge through a phi and the merged value is stored to %out. The %c
; argument is not referenced by the body. The autogenerated assertions below
; pin the resulting control flow and the s_mul_i32 / MULLO_INT in the else arm.
1353define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) {
1354; SI-LABEL: mul32_in_branch:
1355; SI:       ; %bb.0: ; %entry
1356; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
1357; SI-NEXT:    s_waitcnt lgkmcnt(0)
1358; SI-NEXT:    s_cmp_lg_u32 s2, 0
1359; SI-NEXT:    s_cbranch_scc0 .LBB11_2
1360; SI-NEXT:  ; %bb.1: ; %else
1361; SI-NEXT:    s_mul_i32 s6, s2, s3
1362; SI-NEXT:    s_mov_b64 s[4:5], 0
1363; SI-NEXT:    s_branch .LBB11_3
1364; SI-NEXT:  .LBB11_2:
1365; SI-NEXT:    s_mov_b64 s[4:5], -1
1366; SI-NEXT:    ; implicit-def: $sgpr6
1367; SI-NEXT:  .LBB11_3: ; %Flow
1368; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1369; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1370; SI-NEXT:    s_waitcnt lgkmcnt(0)
1371; SI-NEXT:    s_mov_b64 vcc, vcc
1372; SI-NEXT:    s_cbranch_vccnz .LBB11_5
1373; SI-NEXT:  ; %bb.4: ; %if
1374; SI-NEXT:    s_mov_b32 s7, 0xf000
1375; SI-NEXT:    s_mov_b32 s6, -1
1376; SI-NEXT:    s_mov_b32 s4, s2
1377; SI-NEXT:    s_mov_b32 s5, s3
1378; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1379; SI-NEXT:    s_branch .LBB11_6
1380; SI-NEXT:  .LBB11_5:
1381; SI-NEXT:    v_mov_b32_e32 v0, s6
1382; SI-NEXT:  .LBB11_6: ; %endif
1383; SI-NEXT:    s_mov_b32 s3, 0xf000
1384; SI-NEXT:    s_mov_b32 s2, -1
1385; SI-NEXT:    s_waitcnt vmcnt(0)
1386; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1387; SI-NEXT:    s_endpgm
1388;
1389; VI-LABEL: mul32_in_branch:
1390; VI:       ; %bb.0: ; %entry
1391; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1392; VI-NEXT:    s_waitcnt lgkmcnt(0)
1393; VI-NEXT:    s_cmp_lg_u32 s2, 0
1394; VI-NEXT:    s_cbranch_scc0 .LBB11_2
1395; VI-NEXT:  ; %bb.1: ; %else
1396; VI-NEXT:    s_mul_i32 s6, s2, s3
1397; VI-NEXT:    s_mov_b64 s[4:5], 0
1398; VI-NEXT:    s_branch .LBB11_3
1399; VI-NEXT:  .LBB11_2:
1400; VI-NEXT:    s_mov_b64 s[4:5], -1
1401; VI-NEXT:    ; implicit-def: $sgpr6
1402; VI-NEXT:  .LBB11_3: ; %Flow
1403; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1404; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1405; VI-NEXT:    s_cbranch_vccnz .LBB11_5
1406; VI-NEXT:  ; %bb.4: ; %if
1407; VI-NEXT:    s_mov_b32 s7, 0xf000
1408; VI-NEXT:    s_mov_b32 s6, -1
1409; VI-NEXT:    s_waitcnt lgkmcnt(0)
1410; VI-NEXT:    s_mov_b32 s4, s2
1411; VI-NEXT:    s_mov_b32 s5, s3
1412; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1413; VI-NEXT:    s_branch .LBB11_6
1414; VI-NEXT:  .LBB11_5:
1415; VI-NEXT:    v_mov_b32_e32 v0, s6
1416; VI-NEXT:  .LBB11_6: ; %endif
1417; VI-NEXT:    s_waitcnt lgkmcnt(0)
1418; VI-NEXT:    s_mov_b32 s3, 0xf000
1419; VI-NEXT:    s_mov_b32 s2, -1
1420; VI-NEXT:    s_waitcnt vmcnt(0)
1421; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1422; VI-NEXT:    s_endpgm
1423;
1424; GFX9-LABEL: mul32_in_branch:
1425; GFX9:       ; %bb.0: ; %entry
1426; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1427; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1428; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
1429; GFX9-NEXT:    s_cbranch_scc0 .LBB11_2
1430; GFX9-NEXT:  ; %bb.1: ; %else
1431; GFX9-NEXT:    s_mul_i32 s6, s2, s3
1432; GFX9-NEXT:    s_mov_b64 s[4:5], 0
1433; GFX9-NEXT:    s_branch .LBB11_3
1434; GFX9-NEXT:  .LBB11_2:
1435; GFX9-NEXT:    s_mov_b64 s[4:5], -1
1436; GFX9-NEXT:    ; implicit-def: $sgpr6
1437; GFX9-NEXT:  .LBB11_3: ; %Flow
1438; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1439; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
1440; GFX9-NEXT:    s_cbranch_vccnz .LBB11_5
1441; GFX9-NEXT:  ; %bb.4: ; %if
1442; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1443; GFX9-NEXT:    s_mov_b32 s6, -1
1444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX9-NEXT:    s_mov_b32 s4, s2
1446; GFX9-NEXT:    s_mov_b32 s5, s3
1447; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1448; GFX9-NEXT:    s_branch .LBB11_6
1449; GFX9-NEXT:  .LBB11_5:
1450; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1451; GFX9-NEXT:  .LBB11_6: ; %endif
1452; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1454; GFX9-NEXT:    s_mov_b32 s2, -1
1455; GFX9-NEXT:    s_waitcnt vmcnt(0)
1456; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1457; GFX9-NEXT:    s_endpgm
1458;
1459; GFX10-LABEL: mul32_in_branch:
1460; GFX10:       ; %bb.0: ; %entry
1461; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1462; GFX10-NEXT:    s_mov_b32 s4, 0
1463; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1464; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
1465; GFX10-NEXT:    s_cbranch_scc0 .LBB11_2
1466; GFX10-NEXT:  ; %bb.1: ; %else
1467; GFX10-NEXT:    s_mul_i32 s5, s2, s3
1468; GFX10-NEXT:    s_branch .LBB11_3
1469; GFX10-NEXT:  .LBB11_2:
1470; GFX10-NEXT:    s_mov_b32 s4, -1
1471; GFX10-NEXT:    ; implicit-def: $sgpr5
1472; GFX10-NEXT:  .LBB11_3: ; %Flow
1473; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1474; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
1475; GFX10-NEXT:    s_cbranch_vccnz .LBB11_5
1476; GFX10-NEXT:  ; %bb.4: ; %if
1477; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1478; GFX10-NEXT:    s_mov_b32 s6, -1
1479; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX10-NEXT:    s_mov_b32 s4, s2
1481; GFX10-NEXT:    s_mov_b32 s5, s3
1482; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1483; GFX10-NEXT:    s_branch .LBB11_6
1484; GFX10-NEXT:  .LBB11_5:
1485; GFX10-NEXT:    v_mov_b32_e32 v0, s5
1486; GFX10-NEXT:  .LBB11_6: ; %endif
1487; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1488; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1489; GFX10-NEXT:    s_mov_b32 s2, -1
1490; GFX10-NEXT:    s_waitcnt vmcnt(0)
1491; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1492; GFX10-NEXT:    s_endpgm
1493;
1494; GFX11-LABEL: mul32_in_branch:
1495; GFX11:       ; %bb.0: ; %entry
1496; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
1497; GFX11-NEXT:    s_mov_b32 s4, 0
1498; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1499; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
1500; GFX11-NEXT:    s_cbranch_scc0 .LBB11_2
1501; GFX11-NEXT:  ; %bb.1: ; %else
1502; GFX11-NEXT:    s_mul_i32 s5, s2, s3
1503; GFX11-NEXT:    s_branch .LBB11_3
1504; GFX11-NEXT:  .LBB11_2:
1505; GFX11-NEXT:    s_mov_b32 s4, -1
1506; GFX11-NEXT:    ; implicit-def: $sgpr5
1507; GFX11-NEXT:  .LBB11_3: ; %Flow
1508; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1509; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
1510; GFX11-NEXT:    s_cbranch_vccnz .LBB11_5
1511; GFX11-NEXT:  ; %bb.4: ; %if
1512; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1513; GFX11-NEXT:    s_mov_b32 s6, -1
1514; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX11-NEXT:    s_mov_b32 s4, s2
1516; GFX11-NEXT:    s_mov_b32 s5, s3
1517; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
1518; GFX11-NEXT:    s_branch .LBB11_6
1519; GFX11-NEXT:  .LBB11_5:
1520; GFX11-NEXT:    v_mov_b32_e32 v0, s5
1521; GFX11-NEXT:  .LBB11_6: ; %endif
1522; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1523; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1524; GFX11-NEXT:    s_mov_b32 s2, -1
1525; GFX11-NEXT:    s_waitcnt vmcnt(0)
1526; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1527; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1528; GFX11-NEXT:    s_endpgm
1529;
1530; EG-LABEL: mul32_in_branch:
1531; EG:       ; %bb.0: ; %entry
1532; EG-NEXT:    ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[]
1533; EG-NEXT:    JUMP @3 POP:1
1534; EG-NEXT:    ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[]
1535; EG-NEXT:    ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[]
1536; EG-NEXT:    JUMP @8 POP:1
1537; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
1538; EG-NEXT:    TEX 0 @12
1539; EG-NEXT:    POP @8 POP:1
1540; EG-NEXT:    ALU 1, @27, KC0[], KC1[]
1541; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1542; EG-NEXT:    CF_END
1543; EG-NEXT:    PAD
1544; EG-NEXT:    Fetch clause starting at 12:
1545; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1546; EG-NEXT:    ALU clause starting at 14:
1547; EG-NEXT:     MOV T1.W, literal.x,
1548; EG-NEXT:     SETNE_INT * T0.W, KC0[2].W, 0.0,
1549; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1550; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1551; EG-NEXT:    ALU clause starting at 18:
1552; EG-NEXT:     MOV T0.W, KC0[2].W,
1553; EG-NEXT:     MOV * T2.W, KC0[3].X,
1554; EG-NEXT:     MOV T1.W, literal.x,
1555; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
1556; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1557; EG-NEXT:    ALU clause starting at 23:
1558; EG-NEXT:     MOV T0.W, KC0[2].Y,
1559; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
1560; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1561; EG-NEXT:    ALU clause starting at 26:
1562; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1563; EG-NEXT:    ALU clause starting at 27:
1564; EG-NEXT:     LSHR * T1.X, T0.W, literal.x,
1565; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1566entry:
1567  %0 = icmp eq i32 %a, 0 ; take the load arm when %a is zero
1568  br i1 %0, label %if, label %else
1569
1570if:
1571  %1 = load i32, ptr addrspace(1) %in
1572  br label %endif
1573
1574else:
1575  %2 = mul i32 %a, %b ; the multiply exists only on this arm
1576  br label %endif
1577
1578endif:
1579  %3 = phi i32 [%1, %if], [%2, %else] ; merge the loaded value and the product
1580  store i32 %3, ptr addrspace(1) %out
1581  ret void
1582}
1583
; mul64_in_branch: a 64-bit multiply that is reached on only one side of a branch.
; The entry block compares %a with zero. The %if block loads an i64 from %in,
; the %else block computes %a * %b, and %endif stores the merged value to %out.
; The %c argument is not referenced by the body.
; The per-target assertion lines that follow are autogenerated (see the note at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
1584define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
1585; SI-LABEL: mul64_in_branch:
1586; SI:       ; %bb.0: ; %entry
1587; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
1588; SI-NEXT:    s_mov_b64 s[8:9], 0
1589; SI-NEXT:    s_waitcnt lgkmcnt(0)
1590; SI-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
1591; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
1592; SI-NEXT:    s_cbranch_vccz .LBB12_4
1593; SI-NEXT:  ; %bb.1: ; %else
1594; SI-NEXT:    v_mov_b32_e32 v0, s6
1595; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
1596; SI-NEXT:    s_mul_i32 s7, s4, s7
1597; SI-NEXT:    s_mul_i32 s5, s5, s6
1598; SI-NEXT:    s_mul_i32 s4, s4, s6
1599; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
1600; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
1601; SI-NEXT:    v_mov_b32_e32 v0, s4
1602; SI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1603; SI-NEXT:    s_cbranch_vccnz .LBB12_3
1604; SI-NEXT:  .LBB12_2: ; %if
1605; SI-NEXT:    s_mov_b32 s7, 0xf000
1606; SI-NEXT:    s_mov_b32 s6, -1
1607; SI-NEXT:    s_mov_b32 s4, s2
1608; SI-NEXT:    s_mov_b32 s5, s3
1609; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1610; SI-NEXT:  .LBB12_3: ; %endif
1611; SI-NEXT:    s_mov_b32 s3, 0xf000
1612; SI-NEXT:    s_mov_b32 s2, -1
1613; SI-NEXT:    s_waitcnt vmcnt(0)
1614; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1615; SI-NEXT:    s_endpgm
1616; SI-NEXT:  .LBB12_4:
1617; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
1618; SI-NEXT:    s_branch .LBB12_2
1619;
1620; VI-LABEL: mul64_in_branch:
1621; VI:       ; %bb.0: ; %entry
1622; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
1623; VI-NEXT:    s_mov_b64 s[8:9], 0
1624; VI-NEXT:    s_waitcnt lgkmcnt(0)
1625; VI-NEXT:    s_cmp_lg_u64 s[4:5], 0
1626; VI-NEXT:    s_cbranch_scc0 .LBB12_4
1627; VI-NEXT:  ; %bb.1: ; %else
1628; VI-NEXT:    v_mov_b32_e32 v0, s6
1629; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
1630; VI-NEXT:    s_mul_i32 s4, s4, s7
1631; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1632; VI-NEXT:    s_mul_i32 s4, s5, s6
1633; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
1634; VI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1635; VI-NEXT:    s_cbranch_vccnz .LBB12_3
1636; VI-NEXT:  .LBB12_2: ; %if
1637; VI-NEXT:    s_mov_b32 s7, 0xf000
1638; VI-NEXT:    s_mov_b32 s6, -1
1639; VI-NEXT:    s_mov_b32 s4, s2
1640; VI-NEXT:    s_mov_b32 s5, s3
1641; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1642; VI-NEXT:  .LBB12_3: ; %endif
1643; VI-NEXT:    s_mov_b32 s3, 0xf000
1644; VI-NEXT:    s_mov_b32 s2, -1
1645; VI-NEXT:    s_waitcnt vmcnt(0)
1646; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1647; VI-NEXT:    s_endpgm
1648; VI-NEXT:  .LBB12_4:
1649; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
1650; VI-NEXT:    s_branch .LBB12_2
1651;
1652; GFX9-LABEL: mul64_in_branch:
1653; GFX9:       ; %bb.0: ; %entry
1654; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
1655; GFX9-NEXT:    s_mov_b64 s[8:9], 0
1656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
1658; GFX9-NEXT:    s_cbranch_scc0 .LBB12_3
1659; GFX9-NEXT:  ; %bb.1: ; %else
1660; GFX9-NEXT:    s_mul_i32 s7, s4, s7
1661; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s6
1662; GFX9-NEXT:    s_add_i32 s7, s10, s7
1663; GFX9-NEXT:    s_mul_i32 s5, s5, s6
1664; GFX9-NEXT:    s_add_i32 s5, s7, s5
1665; GFX9-NEXT:    s_mul_i32 s4, s4, s6
1666; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1667; GFX9-NEXT:    s_cbranch_vccnz .LBB12_4
1668; GFX9-NEXT:  .LBB12_2: ; %if
1669; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1670; GFX9-NEXT:    s_mov_b32 s6, -1
1671; GFX9-NEXT:    s_mov_b32 s4, s2
1672; GFX9-NEXT:    s_mov_b32 s5, s3
1673; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1674; GFX9-NEXT:    s_branch .LBB12_5
1675; GFX9-NEXT:  .LBB12_3:
1676; GFX9-NEXT:    ; implicit-def: $sgpr4_sgpr5
1677; GFX9-NEXT:    s_branch .LBB12_2
1678; GFX9-NEXT:  .LBB12_4:
1679; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1680; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1681; GFX9-NEXT:  .LBB12_5: ; %endif
1682; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1683; GFX9-NEXT:    s_mov_b32 s2, -1
1684; GFX9-NEXT:    s_waitcnt vmcnt(0)
1685; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1686; GFX9-NEXT:    s_endpgm
1687;
1688; GFX10-LABEL: mul64_in_branch:
1689; GFX10:       ; %bb.0: ; %entry
1690; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
1691; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1692; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
1693; GFX10-NEXT:    s_cbranch_scc0 .LBB12_3
1694; GFX10-NEXT:  ; %bb.1: ; %else
1695; GFX10-NEXT:    s_mul_i32 s7, s4, s7
1696; GFX10-NEXT:    s_mul_hi_u32 s8, s4, s6
1697; GFX10-NEXT:    s_mul_i32 s5, s5, s6
1698; GFX10-NEXT:    s_add_i32 s7, s8, s7
1699; GFX10-NEXT:    s_mul_i32 s4, s4, s6
1700; GFX10-NEXT:    s_add_i32 s5, s7, s5
1701; GFX10-NEXT:    s_mov_b32 s6, 0
1702; GFX10-NEXT:    s_cbranch_execnz .LBB12_4
1703; GFX10-NEXT:  .LBB12_2: ; %if
1704; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1705; GFX10-NEXT:    s_mov_b32 s6, -1
1706; GFX10-NEXT:    s_mov_b32 s4, s2
1707; GFX10-NEXT:    s_mov_b32 s5, s3
1708; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1709; GFX10-NEXT:    s_branch .LBB12_5
1710; GFX10-NEXT:  .LBB12_3:
1711; GFX10-NEXT:    s_mov_b32 s6, -1
1712; GFX10-NEXT:    ; implicit-def: $sgpr4_sgpr5
1713; GFX10-NEXT:    s_branch .LBB12_2
1714; GFX10-NEXT:  .LBB12_4:
1715; GFX10-NEXT:    v_mov_b32_e32 v0, s4
1716; GFX10-NEXT:    v_mov_b32_e32 v1, s5
1717; GFX10-NEXT:  .LBB12_5: ; %endif
1718; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
1719; GFX10-NEXT:    s_mov_b32 s2, -1
1720; GFX10-NEXT:    s_waitcnt vmcnt(0)
1721; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1722; GFX10-NEXT:    s_endpgm
1723;
1724; GFX11-LABEL: mul64_in_branch:
1725; GFX11:       ; %bb.0: ; %entry
1726; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
1727; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
1729; GFX11-NEXT:    s_cbranch_scc0 .LBB12_3
1730; GFX11-NEXT:  ; %bb.1: ; %else
1731; GFX11-NEXT:    s_mul_i32 s7, s4, s7
1732; GFX11-NEXT:    s_mul_hi_u32 s8, s4, s6
1733; GFX11-NEXT:    s_mul_i32 s5, s5, s6
1734; GFX11-NEXT:    s_add_i32 s7, s8, s7
1735; GFX11-NEXT:    s_mul_i32 s4, s4, s6
1736; GFX11-NEXT:    s_add_i32 s5, s7, s5
1737; GFX11-NEXT:    s_mov_b32 s6, 0
1738; GFX11-NEXT:    s_cbranch_execnz .LBB12_4
1739; GFX11-NEXT:  .LBB12_2: ; %if
1740; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1741; GFX11-NEXT:    s_mov_b32 s6, -1
1742; GFX11-NEXT:    s_mov_b32 s4, s2
1743; GFX11-NEXT:    s_mov_b32 s5, s3
1744; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
1745; GFX11-NEXT:    s_branch .LBB12_5
1746; GFX11-NEXT:  .LBB12_3:
1747; GFX11-NEXT:    s_mov_b32 s6, -1
1748; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
1749; GFX11-NEXT:    s_branch .LBB12_2
1750; GFX11-NEXT:  .LBB12_4:
1751; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1752; GFX11-NEXT:  .LBB12_5: ; %endif
1753; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
1754; GFX11-NEXT:    s_mov_b32 s2, -1
1755; GFX11-NEXT:    s_waitcnt vmcnt(0)
1756; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1757; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1758; GFX11-NEXT:    s_endpgm
1759;
1760; EG-LABEL: mul64_in_branch:
1761; EG:       ; %bb.0: ; %entry
1762; EG-NEXT:    ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
1763; EG-NEXT:    JUMP @3 POP:1
1764; EG-NEXT:    ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[]
1765; EG-NEXT:    ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[]
1766; EG-NEXT:    JUMP @8 POP:1
1767; EG-NEXT:    ALU 0, @34, KC0[CB0:0-32], KC1[]
1768; EG-NEXT:    TEX 0 @12
1769; EG-NEXT:    POP @8 POP:1
1770; EG-NEXT:    ALU 1, @35, KC0[], KC1[]
1771; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1772; EG-NEXT:    CF_END
1773; EG-NEXT:    PAD
1774; EG-NEXT:    Fetch clause starting at 12:
1775; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1776; EG-NEXT:    ALU clause starting at 14:
1777; EG-NEXT:     OR_INT T0.W, KC0[2].W, KC0[3].X,
1778; EG-NEXT:     MOV * T1.W, literal.x,
1779; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
1780; EG-NEXT:     SETNE_INT * T0.W, PV.W, 0.0,
1781; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
1782; EG-NEXT:    ALU clause starting at 19:
1783; EG-NEXT:     MOV T0.W, KC0[2].W,
1784; EG-NEXT:     MOV * T1.W, KC0[3].Z,
1785; EG-NEXT:     MOV T2.W, KC0[3].Y,
1786; EG-NEXT:     MULLO_INT * T0.X, PV.W, PS,
1787; EG-NEXT:     MOV T1.W, KC0[3].X,
1788; EG-NEXT:     MULHI * T0.Y, T0.W, PV.W,
1789; EG-NEXT:     ADD_INT T3.W, PS, T0.X,
1790; EG-NEXT:     MULLO_INT * T0.X, PV.W, T2.W,
1791; EG-NEXT:     ADD_INT T0.Y, PV.W, PS,
1792; EG-NEXT:     MOV T1.W, literal.x,
1793; EG-NEXT:     MULLO_INT * T0.X, T0.W, T2.W,
1794; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1795; EG-NEXT:    ALU clause starting at 31:
1796; EG-NEXT:     MOV T0.W, KC0[2].Y,
1797; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
1798; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1799; EG-NEXT:    ALU clause starting at 34:
1800; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1801; EG-NEXT:    ALU clause starting at 35:
1802; EG-NEXT:     LSHR * T1.X, T0.W, literal.x,
1803; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1804entry:
  ; Take the %if path when %a is zero, otherwise the %else path.
1805  %0 = icmp eq i64 %a, 0
1806  br i1 %0, label %if, label %else
1807
1808if:
  ; Zero case: the stored value comes from memory rather than from the multiply.
1809  %1 = load i64, ptr addrspace(1) %in
1810  br label %endif
1811
1812else:
  ; Non-zero case: the 64-bit multiply this function is named after.
1813  %2 = mul i64 %a, %b
1814  br label %endif
1815
1816endif:
  ; Merge the two paths and write the selected value to %out.
1817  %3 = phi i64 [%1, %if], [%2, %else]
1818  store i64 %3, ptr addrspace(1) %out
1819  ret void
1820}
1821
; s_mul_i128: 128-bit multiply of two kernel arguments.
; %a and %b are passed by value as i128 kernel arguments and the full i128
; product is stored to %out. Each i128 argument is preceded by an unnamed
; [8 x i32] argument that the body never reads (presumably padding that
; separates the operands in the argument buffer; TODO confirm).
; NOTE(review): the s_ prefix presumably marks the by-value (scalar operand)
; variant, as opposed to v_mul_i128 below, which takes pointer arguments;
; confirm against the rest of the file.
; The per-target assertion lines that follow are autogenerated (see the note at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
1822define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
1823; SI-LABEL: s_mul_i128:
1824; SI:       ; %bb.0: ; %entry
1825; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x13
1826; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x1f
1827; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1828; SI-NEXT:    s_mov_b32 s3, 0xf000
1829; SI-NEXT:    s_mov_b32 s2, -1
1830; SI-NEXT:    s_waitcnt lgkmcnt(0)
1831; SI-NEXT:    v_mov_b32_e32 v0, s6
1832; SI-NEXT:    v_mul_hi_u32 v0, s8, v0
1833; SI-NEXT:    v_mov_b32_e32 v1, s4
1834; SI-NEXT:    v_mul_hi_u32 v1, s10, v1
1835; SI-NEXT:    s_mul_i32 s7, s8, s7
1836; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
1837; SI-NEXT:    s_mul_i32 s7, s10, s5
1838; SI-NEXT:    s_mul_i32 s12, s9, s6
1839; SI-NEXT:    s_mul_i32 s6, s8, s6
1840; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
1841; SI-NEXT:    s_mul_i32 s7, s11, s4
1842; SI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
1843; SI-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
1844; SI-NEXT:    s_mul_i32 s7, s10, s4
1845; SI-NEXT:    v_mov_b32_e32 v2, s6
1846; SI-NEXT:    v_add_i32_e32 v2, vcc, s7, v2
1847; SI-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
1848; SI-NEXT:    v_mov_b32_e32 v1, s8
1849; SI-NEXT:    v_mul_hi_u32 v5, s4, v1
1850; SI-NEXT:    v_mul_hi_u32 v1, s5, v1
1851; SI-NEXT:    v_mov_b32_e32 v3, s9
1852; SI-NEXT:    v_mul_hi_u32 v4, s4, v3
1853; SI-NEXT:    s_mul_i32 s7, s5, s8
1854; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
1855; SI-NEXT:    s_mul_i32 s6, s4, s9
1856; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
1857; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v5
1858; SI-NEXT:    v_mul_hi_u32 v3, s5, v3
1859; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
1860; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
1861; SI-NEXT:    s_mul_i32 s5, s5, s9
1862; SI-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, 0, vcc
1863; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v4
1864; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
1865; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1866; SI-NEXT:    s_mul_i32 s4, s4, s8
1867; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
1868; SI-NEXT:    v_mov_b32_e32 v0, s4
1869; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1870; SI-NEXT:    s_endpgm
1871;
1872; VI-LABEL: s_mul_i128:
1873; VI:       ; %bb.0: ; %entry
1874; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
1875; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
1876; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1877; VI-NEXT:    v_mov_b32_e32 v5, 0
1878; VI-NEXT:    s_mov_b32 s3, 0xf000
1879; VI-NEXT:    s_waitcnt lgkmcnt(0)
1880; VI-NEXT:    v_mov_b32_e32 v0, s6
1881; VI-NEXT:    v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
1882; VI-NEXT:    s_mul_i32 s7, s8, s7
1883; VI-NEXT:    v_mov_b32_e32 v6, s8
1884; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
1885; VI-NEXT:    s_mul_i32 s12, s9, s6
1886; VI-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
1887; VI-NEXT:    v_add_u32_e32 v3, vcc, s12, v3
1888; VI-NEXT:    v_mov_b32_e32 v4, v1
1889; VI-NEXT:    v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
1890; VI-NEXT:    v_mov_b32_e32 v8, s4
1891; VI-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
1892; VI-NEXT:    v_mov_b32_e32 v3, v7
1893; VI-NEXT:    v_mov_b32_e32 v7, v5
1894; VI-NEXT:    v_mov_b32_e32 v8, s9
1895; VI-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
1896; VI-NEXT:    s_mul_i32 s8, s11, s4
1897; VI-NEXT:    v_add_u32_e32 v6, vcc, s8, v2
1898; VI-NEXT:    v_mov_b32_e32 v2, v5
1899; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1900; VI-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
1901; VI-NEXT:    s_mul_i32 s8, s10, s5
1902; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
1903; VI-NEXT:    v_add_u32_e32 v5, vcc, s8, v6
1904; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
1905; VI-NEXT:    s_mov_b32 s2, -1
1906; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
1907; VI-NEXT:    v_mov_b32_e32 v1, v4
1908; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1909; VI-NEXT:    s_endpgm
1910;
1911; GFX9-LABEL: s_mul_i128:
1912; GFX9:       ; %bb.0: ; %entry
1913; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
1914; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
1915; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
1916; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1917; GFX9-NEXT:    s_mov_b32 s14, -1
1918; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1919; GFX9-NEXT:    s_mul_i32 s0, s8, s7
1920; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s6
1921; GFX9-NEXT:    s_mul_i32 s2, s10, s5
1922; GFX9-NEXT:    s_mul_hi_u32 s3, s10, s4
1923; GFX9-NEXT:    s_add_i32 s0, s1, s0
1924; GFX9-NEXT:    s_mul_i32 s1, s9, s6
1925; GFX9-NEXT:    s_add_i32 s2, s3, s2
1926; GFX9-NEXT:    s_mul_i32 s3, s11, s4
1927; GFX9-NEXT:    s_add_i32 s0, s0, s1
1928; GFX9-NEXT:    s_mul_i32 s1, s8, s6
1929; GFX9-NEXT:    s_add_i32 s2, s2, s3
1930; GFX9-NEXT:    s_mul_i32 s3, s10, s4
1931; GFX9-NEXT:    s_add_u32 s3, s3, s1
1932; GFX9-NEXT:    s_addc_u32 s2, s2, s0
1933; GFX9-NEXT:    s_mul_i32 s10, s5, s8
1934; GFX9-NEXT:    s_mul_hi_u32 s11, s4, s8
1935; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s8
1936; GFX9-NEXT:    s_add_u32 s10, s10, s11
1937; GFX9-NEXT:    s_mul_i32 s1, s4, s9
1938; GFX9-NEXT:    s_addc_u32 s7, s7, 0
1939; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s9
1940; GFX9-NEXT:    s_add_u32 s1, s1, s10
1941; GFX9-NEXT:    s_addc_u32 s6, s6, 0
1942; GFX9-NEXT:    s_add_u32 s6, s7, s6
1943; GFX9-NEXT:    s_addc_u32 s7, 0, 0
1944; GFX9-NEXT:    s_mul_hi_u32 s10, s5, s9
1945; GFX9-NEXT:    s_mul_i32 s5, s5, s9
1946; GFX9-NEXT:    s_add_u32 s5, s5, s6
1947; GFX9-NEXT:    s_addc_u32 s6, s10, s7
1948; GFX9-NEXT:    s_mov_b32 s0, 0
1949; GFX9-NEXT:    s_add_u32 s5, s5, s3
1950; GFX9-NEXT:    s_addc_u32 s6, s6, s2
1951; GFX9-NEXT:    s_mul_i32 s2, s4, s8
1952; GFX9-NEXT:    s_mov_b32 s3, s0
1953; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1954; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1955; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1956; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1957; GFX9-NEXT:    v_mov_b32_e32 v3, s6
1958; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1959; GFX9-NEXT:    s_endpgm
1960;
1961; GFX10-LABEL: s_mul_i128:
1962; GFX10:       ; %bb.0: ; %entry
1963; GFX10-NEXT:    s_clause 0x1
1964; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4c
1965; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x7c
1966; GFX10-NEXT:    s_mov_b32 s2, 0
1967; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1968; GFX10-NEXT:    s_mov_b32 s13, s2
1969; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1970; GFX10-NEXT:    s_mul_i32 s3, s8, s7
1971; GFX10-NEXT:    s_mul_hi_u32 s7, s8, s6
1972; GFX10-NEXT:    s_mul_i32 s14, s10, s5
1973; GFX10-NEXT:    s_mul_hi_u32 s15, s10, s4
1974; GFX10-NEXT:    s_mul_i32 s12, s9, s6
1975; GFX10-NEXT:    s_mul_i32 s11, s11, s4
1976; GFX10-NEXT:    s_add_i32 s3, s7, s3
1977; GFX10-NEXT:    s_add_i32 s7, s15, s14
1978; GFX10-NEXT:    s_mul_i32 s6, s8, s6
1979; GFX10-NEXT:    s_mul_i32 s10, s10, s4
1980; GFX10-NEXT:    s_add_i32 s3, s3, s12
1981; GFX10-NEXT:    s_add_i32 s7, s7, s11
1982; GFX10-NEXT:    s_mul_i32 s19, s5, s8
1983; GFX10-NEXT:    s_mul_hi_u32 s20, s4, s8
1984; GFX10-NEXT:    s_add_u32 s6, s10, s6
1985; GFX10-NEXT:    s_mul_hi_u32 s18, s5, s8
1986; GFX10-NEXT:    s_addc_u32 s7, s7, s3
1987; GFX10-NEXT:    s_mul_i32 s17, s4, s9
1988; GFX10-NEXT:    s_add_u32 s3, s19, s20
1989; GFX10-NEXT:    s_mul_hi_u32 s16, s4, s9
1990; GFX10-NEXT:    s_mul_hi_u32 s21, s5, s9
1991; GFX10-NEXT:    s_mul_i32 s5, s5, s9
1992; GFX10-NEXT:    s_addc_u32 s9, s18, 0
1993; GFX10-NEXT:    s_add_u32 s3, s17, s3
1994; GFX10-NEXT:    s_addc_u32 s10, s16, 0
1995; GFX10-NEXT:    s_mul_i32 s12, s4, s8
1996; GFX10-NEXT:    s_add_u32 s4, s9, s10
1997; GFX10-NEXT:    s_addc_u32 s8, 0, 0
1998; GFX10-NEXT:    s_add_u32 s4, s5, s4
1999; GFX10-NEXT:    s_addc_u32 s5, s21, s8
2000; GFX10-NEXT:    s_add_u32 s4, s4, s6
2001; GFX10-NEXT:    s_addc_u32 s5, s5, s7
2002; GFX10-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
2003; GFX10-NEXT:    v_mov_b32_e32 v2, s4
2004; GFX10-NEXT:    v_mov_b32_e32 v0, s2
2005; GFX10-NEXT:    v_mov_b32_e32 v1, s3
2006; GFX10-NEXT:    v_mov_b32_e32 v3, s5
2007; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
2008; GFX10-NEXT:    s_mov_b32 s2, -1
2009; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2010; GFX10-NEXT:    s_endpgm
2011;
2012; GFX11-LABEL: s_mul_i128:
2013; GFX11:       ; %bb.0: ; %entry
2014; GFX11-NEXT:    s_clause 0x2
2015; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x4c
2016; GFX11-NEXT:    s_load_b128 s[8:11], s[0:1], 0x7c
2017; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2018; GFX11-NEXT:    s_mov_b32 s2, 0
2019; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2020; GFX11-NEXT:    s_mov_b32 s13, s2
2021; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2022; GFX11-NEXT:    s_mul_i32 s3, s8, s7
2023; GFX11-NEXT:    s_mul_hi_u32 s7, s8, s6
2024; GFX11-NEXT:    s_mul_i32 s14, s10, s5
2025; GFX11-NEXT:    s_mul_hi_u32 s15, s10, s4
2026; GFX11-NEXT:    s_mul_i32 s12, s9, s6
2027; GFX11-NEXT:    s_mul_i32 s11, s11, s4
2028; GFX11-NEXT:    s_add_i32 s3, s7, s3
2029; GFX11-NEXT:    s_add_i32 s7, s15, s14
2030; GFX11-NEXT:    s_mul_i32 s6, s8, s6
2031; GFX11-NEXT:    s_mul_i32 s10, s10, s4
2032; GFX11-NEXT:    s_add_i32 s3, s3, s12
2033; GFX11-NEXT:    s_add_i32 s7, s7, s11
2034; GFX11-NEXT:    s_mul_i32 s19, s5, s8
2035; GFX11-NEXT:    s_mul_hi_u32 s20, s4, s8
2036; GFX11-NEXT:    s_add_u32 s6, s10, s6
2037; GFX11-NEXT:    s_mul_hi_u32 s18, s5, s8
2038; GFX11-NEXT:    s_addc_u32 s7, s7, s3
2039; GFX11-NEXT:    s_mul_i32 s17, s4, s9
2040; GFX11-NEXT:    s_add_u32 s3, s19, s20
2041; GFX11-NEXT:    s_mul_hi_u32 s16, s4, s9
2042; GFX11-NEXT:    s_mul_hi_u32 s21, s5, s9
2043; GFX11-NEXT:    s_mul_i32 s5, s5, s9
2044; GFX11-NEXT:    s_addc_u32 s9, s18, 0
2045; GFX11-NEXT:    s_add_u32 s3, s17, s3
2046; GFX11-NEXT:    s_addc_u32 s10, s16, 0
2047; GFX11-NEXT:    s_mul_i32 s12, s4, s8
2048; GFX11-NEXT:    s_add_u32 s4, s9, s10
2049; GFX11-NEXT:    s_addc_u32 s8, 0, 0
2050; GFX11-NEXT:    s_add_u32 s4, s5, s4
2051; GFX11-NEXT:    s_addc_u32 s5, s21, s8
2052; GFX11-NEXT:    s_add_u32 s4, s4, s6
2053; GFX11-NEXT:    s_addc_u32 s5, s5, s7
2054; GFX11-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
2055; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2056; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
2057; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
2058; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
2059; GFX11-NEXT:    s_mov_b32 s2, -1
2060; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
2061; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2062; GFX11-NEXT:    s_endpgm
2063;
2064; EG-LABEL: s_mul_i128:
2065; EG:       ; %bb.0: ; %entry
2066; EG-NEXT:    ALU 41, @4, KC0[CB0:0-32], KC1[]
2067; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2068; EG-NEXT:    CF_END
2069; EG-NEXT:    PAD
2070; EG-NEXT:    ALU clause starting at 4:
2071; EG-NEXT:     MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
2072; EG-NEXT:     MULHI * T0.Y, KC0[5].X, KC0[8].X,
2073; EG-NEXT:     MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
2074; EG-NEXT:     MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
2075; EG-NEXT:     MULHI * T1.X, KC0[5].X, KC0[7].W,
2076; EG-NEXT:     MULHI * T1.Y, KC0[4].W, KC0[8].X,
2077; EG-NEXT:     MULHI * T1.Z, KC0[8].Y, KC0[4].W,
2078; EG-NEXT:     MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
2079; EG-NEXT:     MULHI * T2.X, KC0[7].W, KC0[5].Y,
2080; EG-NEXT:     MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
2081; EG-NEXT:     MULHI * T2.Z, KC0[4].W, KC0[7].W,
2082; EG-NEXT:     ADD_INT T2.W, T2.Y, PS,
2083; EG-NEXT:     MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
2084; EG-NEXT:     ADDC_UINT T2.Z, T2.Y, T2.Z,
2085; EG-NEXT:     ADDC_UINT T3.W, PS, PV.W,
2086; EG-NEXT:     MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
2087; EG-NEXT:     ADD_INT T2.X, T2.X, PS,
2088; EG-NEXT:     ADD_INT T2.Y, T1.Z, T1.W,
2089; EG-NEXT:     ADD_INT T1.Z, T1.Y, PV.W,
2090; EG-NEXT:     ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
2091; EG-NEXT:     MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
2092; EG-NEXT:     ADD_INT T4.X, PV.W, PV.Z,
2093; EG-NEXT:     ADDC_UINT T1.Y, PV.W, PV.Z,
2094; EG-NEXT:     ADD_INT T1.Z, PV.Y, PS,
2095; EG-NEXT:     ADD_INT T0.W, PV.X, T0.W,
2096; EG-NEXT:     MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
2097; EG-NEXT:     ADD_INT T2.Y, PV.Z, PV.W,
2098; EG-NEXT:     ADDC_UINT T1.Z, T0.Z, PS,
2099; EG-NEXT:     ADD_INT T0.W, T0.Y, PV.Y,
2100; EG-NEXT:     ADDC_UINT * T1.W, T0.X, PV.X,
2101; EG-NEXT:     ADD_INT T0.Y, T0.X, T4.X,
2102; EG-NEXT:     ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
2103; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2104; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
2105; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2106; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
2107; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
2108; EG-NEXT:     ADD_INT * T0.Z, T0.Y, T0.Z,
2109; EG-NEXT:     ADD_INT * T0.Y, T3.X, T2.W,
2110; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2111; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2112; EG-NEXT:     MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
2113entry:
  ; Single i128 multiply of the two by-value arguments; the whole product is
  ; written to %out with one i128 store.
2114  %mul = mul i128 %a, %b
2115  store i128 %mul, ptr addrspace(1) %out
2116  ret void
2117}
2118
2119define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2120; SI-LABEL: v_mul_i128:
2121; SI:       ; %bb.0: ; %entry
2122; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2123; SI-NEXT:    s_mov_b32 s7, 0xf000
2124; SI-NEXT:    s_mov_b32 s6, 0
2125; SI-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
2126; SI-NEXT:    v_mov_b32_e32 v9, 0
2127; SI-NEXT:    s_waitcnt lgkmcnt(0)
2128; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
2129; SI-NEXT:    s_mov_b64 s[0:1], s[2:3]
2130; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
2131; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
2132; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
2133; SI-NEXT:    s_waitcnt vmcnt(0)
2134; SI-NEXT:    v_mul_lo_u32 v3, v4, v3
2135; SI-NEXT:    v_mul_hi_u32 v10, v4, v2
2136; SI-NEXT:    v_mul_lo_u32 v12, v6, v1
2137; SI-NEXT:    v_mul_hi_u32 v13, v6, v0
2138; SI-NEXT:    v_mul_lo_u32 v17, v1, v4
2139; SI-NEXT:    v_mul_hi_u32 v18, v0, v4
2140; SI-NEXT:    v_mul_lo_u32 v11, v5, v2
2141; SI-NEXT:    v_mul_lo_u32 v7, v7, v0
2142; SI-NEXT:    v_mul_hi_u32 v16, v1, v4
2143; SI-NEXT:    v_mul_lo_u32 v15, v0, v5
2144; SI-NEXT:    v_mul_hi_u32 v14, v0, v5
2145; SI-NEXT:    v_mul_hi_u32 v19, v1, v5
2146; SI-NEXT:    v_mul_lo_u32 v5, v1, v5
2147; SI-NEXT:    v_add_i32_e32 v1, vcc, v10, v3
2148; SI-NEXT:    v_add_i32_e32 v3, vcc, v13, v12
2149; SI-NEXT:    v_mul_lo_u32 v2, v4, v2
2150; SI-NEXT:    v_mul_lo_u32 v6, v6, v0
2151; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
2152; SI-NEXT:    v_add_i32_e32 v4, vcc, v17, v18
2153; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v16, vcc
2154; SI-NEXT:    v_add_i32_e32 v11, vcc, v1, v11
2155; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
2156; SI-NEXT:    v_add_i32_e32 v1, vcc, v15, v4
2157; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
2158; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
2159; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
2160; SI-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
2161; SI-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
2162; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2163; SI-NEXT:    v_addc_u32_e32 v5, vcc, v19, v6, vcc
2164; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
2165; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
2166; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
2167; SI-NEXT:    s_endpgm
2168;
2169; VI-LABEL: v_mul_i128:
2170; VI:       ; %bb.0: ; %entry
2171; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
2172; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
2173; VI-NEXT:    v_mov_b32_e32 v11, 0
2174; VI-NEXT:    s_waitcnt lgkmcnt(0)
2175; VI-NEXT:    v_mov_b32_e32 v1, s1
2176; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2177; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2178; VI-NEXT:    v_mov_b32_e32 v3, s3
2179; VI-NEXT:    v_add_u32_e32 v8, vcc, s2, v2
2180; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
2181; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2182; VI-NEXT:    flat_load_dwordx4 v[4:7], v[8:9]
2183; VI-NEXT:    s_waitcnt vmcnt(0)
2184; VI-NEXT:    v_mul_lo_u32 v10, v4, v3
2185; VI-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
2186; VI-NEXT:    v_mul_lo_u32 v14, v5, v2
2187; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
2188; VI-NEXT:    v_mul_lo_u32 v15, v7, v0
2189; VI-NEXT:    v_add_u32_e32 v7, vcc, v13, v10
2190; VI-NEXT:    v_mov_b32_e32 v10, v3
2191; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
2192; VI-NEXT:    v_add_u32_e32 v13, vcc, v7, v14
2193; VI-NEXT:    v_mov_b32_e32 v7, v4
2194; VI-NEXT:    v_mov_b32_e32 v4, v11
2195; VI-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
2196; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
2197; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v13
2198; VI-NEXT:    v_mov_b32_e32 v0, v4
2199; VI-NEXT:    v_mul_lo_u32 v10, v6, v1
2200; VI-NEXT:    v_add_u32_e32 v6, vcc, v7, v0
2201; VI-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
2202; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
2203; VI-NEXT:    v_add_u32_e32 v5, vcc, v10, v11
2204; VI-NEXT:    v_add_u32_e32 v4, vcc, v0, v12
2205; VI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
2206; VI-NEXT:    flat_store_dwordx4 v[8:9], v[2:5]
2207; VI-NEXT:    s_endpgm
2208;
2209; GFX9-LABEL: v_mul_i128:
2210; GFX9:       ; %bb.0: ; %entry
2211; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
2212; GFX9-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
2213; GFX9-NEXT:    v_mov_b32_e32 v10, 0
2214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2215; GFX9-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
2216; GFX9-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
2217; GFX9-NEXT:    s_waitcnt vmcnt(0)
2218; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
2219; GFX9-NEXT:    v_mul_lo_u32 v14, v5, v2
2220; GFX9-NEXT:    v_mul_lo_u32 v15, v4, v3
2221; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
2222; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
2223; GFX9-NEXT:    v_mov_b32_e32 v4, v12
2224; GFX9-NEXT:    v_mov_b32_e32 v12, v10
2225; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
2226; GFX9-NEXT:    v_add3_u32 v3, v3, v15, v14
2227; GFX9-NEXT:    v_mul_lo_u32 v17, v7, v0
2228; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
2229; GFX9-NEXT:    v_mov_b32_e32 v0, v10
2230; GFX9-NEXT:    v_mul_lo_u32 v16, v6, v1
2231; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v0
2232; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
2233; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
2234; GFX9-NEXT:    v_add3_u32 v3, v17, v3, v16
2235; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v0, v2
2236; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
2237; GFX9-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
2238; GFX9-NEXT:    s_endpgm
2239;
2240; GFX10-LABEL: v_mul_i128:
2241; GFX10:       ; %bb.0: ; %entry
2242; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
2243; GFX10-NEXT:    v_lshlrev_b32_e32 v14, 4, v0
2244; GFX10-NEXT:    v_mov_b32_e32 v10, 0
2245; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2246; GFX10-NEXT:    s_clause 0x1
2247; GFX10-NEXT:    global_load_dwordx4 v[0:3], v14, s[0:1]
2248; GFX10-NEXT:    global_load_dwordx4 v[4:7], v14, s[2:3]
2249; GFX10-NEXT:    s_waitcnt vmcnt(0)
2250; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
2251; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v0
2252; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
2253; GFX10-NEXT:    v_mov_b32_e32 v9, v12
2254; GFX10-NEXT:    v_mov_b32_e32 v12, v10
2255; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v2
2256; GFX10-NEXT:    v_mad_u64_u32 v[12:13], s0, v0, v5, v[11:12]
2257; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v3
2258; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v2, 0
2259; GFX10-NEXT:    v_mov_b32_e32 v4, v13
2260; GFX10-NEXT:    v_mul_lo_u32 v13, v6, v1
2261; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v10
2262; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v4
2263; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s0, 0, 0, s0
2264; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
2265; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v5, v[9:10]
2266; GFX10-NEXT:    v_mov_b32_e32 v9, v12
2267; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v13
2268; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
2269; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
2270; GFX10-NEXT:    global_store_dwordx4 v14, v[8:11], s[2:3]
2271; GFX10-NEXT:    s_endpgm
2272;
2273; GFX11-LABEL: v_mul_i128:
2274; GFX11:       ; %bb.0: ; %entry
2275; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x2c
2276; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
2277; GFX11-NEXT:    v_mov_b32_e32 v10, 0
2278; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2279; GFX11-NEXT:    s_clause 0x1
2280; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[0:1]
2281; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[2:3]
2282; GFX11-NEXT:    s_waitcnt vmcnt(0)
2283; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v0, v4, 0
2284; GFX11-NEXT:    v_mul_lo_u32 v15, v5, v2
2285; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
2286; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2287; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
2288; GFX11-NEXT:    v_dual_mov_b32 v9, v12 :: v_dual_mov_b32 v12, v10
2289; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2290; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v0, v5, v[11:12]
2291; GFX11-NEXT:    v_mad_u64_u32 v[10:11], null, v4, v2, 0
2292; GFX11-NEXT:    v_mul_lo_u32 v4, v6, v1
2293; GFX11-NEXT:    v_mul_lo_u32 v12, v7, v0
2294; GFX11-NEXT:    v_mov_b32_e32 v2, v14
2295; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
2296; GFX11-NEXT:    v_add3_u32 v11, v11, v3, v15
2297; GFX11-NEXT:    v_add_co_u32 v2, s0, v9, v2
2298; GFX11-NEXT:    v_mov_b32_e32 v9, v13
2299; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
2300; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
2301; GFX11-NEXT:    v_mad_u64_u32 v[14:15], null, v6, v0, v[10:11]
2302; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
2303; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2304; GFX11-NEXT:    v_add3_u32 v0, v12, v15, v4
2305; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v14
2306; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2307; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
2308; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[2:3]
2309; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2310; GFX11-NEXT:    s_endpgm
2311;
2312; EG-LABEL: v_mul_i128:
2313; EG:       ; %bb.0: ; %entry
2314; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
2315; EG-NEXT:    TEX 1 @6
2316; EG-NEXT:    ALU 41, @14, KC0[], KC1[]
2317; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2318; EG-NEXT:    CF_END
2319; EG-NEXT:    PAD
2320; EG-NEXT:    Fetch clause starting at 6:
2321; EG-NEXT:     VTX_READ_128 T2.XYZW, T1.X, 0, #1
2322; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2323; EG-NEXT:    ALU clause starting at 10:
2324; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2325; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
2326; EG-NEXT:     ADD_INT T0.X, KC0[2].Z, PV.W,
2327; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, PV.W,
2328; EG-NEXT:    ALU clause starting at 14:
2329; EG-NEXT:     MULLO_INT * T1.Y, T0.Y, T2.Y,
2330; EG-NEXT:     MULHI * T1.Z, T0.Y, T2.Y,
2331; EG-NEXT:     MULLO_INT * T1.W, T2.Z, T0.X,
2332; EG-NEXT:     MULLO_INT * T3.X, T2.Y, T0.Z,
2333; EG-NEXT:     MULHI * T3.Y, T0.Y, T2.X,
2334; EG-NEXT:     MULHI * T3.Z, T0.X, T2.Y,
2335; EG-NEXT:     MULHI * T3.W, T2.Z, T0.X,
2336; EG-NEXT:     MULLO_INT * T2.Z, T2.Z, T0.Y,
2337; EG-NEXT:     MULHI * T4.X, T2.X, T0.Z,
2338; EG-NEXT:     MULLO_INT * T0.Y, T0.Y, T2.X,
2339; EG-NEXT:     MULHI * T4.Y, T0.X, T2.X,
2340; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
2341; EG-NEXT:     MULLO_INT * T2.Y, T0.X, T2.Y,
2342; EG-NEXT:     ADDC_UINT T4.Z, T0.Y, T4.Y,
2343; EG-NEXT:     ADDC_UINT T5.W, PS, PV.W,
2344; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.W,
2345; EG-NEXT:     ADD_INT T4.X, T4.X, PS,
2346; EG-NEXT:     ADD_INT T0.Y, T3.W, T2.Z,
2347; EG-NEXT:     ADD_INT T2.Z, T3.Z, PV.W,
2348; EG-NEXT:     ADD_INT T0.W, T3.Y, PV.Z,
2349; EG-NEXT:     MULLO_INT * T2.W, T2.W, T0.X,
2350; EG-NEXT:     ADD_INT T5.X, PV.W, PV.Z,
2351; EG-NEXT:     ADDC_UINT T3.Y, PV.W, PV.Z,
2352; EG-NEXT:     ADD_INT T2.Z, PV.Y, PS,
2353; EG-NEXT:     ADD_INT T0.W, PV.X, T3.X,
2354; EG-NEXT:     MULLO_INT * T0.Y, T2.X, T0.Z,
2355; EG-NEXT:     ADD_INT T4.Y, PV.Z, PV.W,
2356; EG-NEXT:     ADDC_UINT T0.Z, T1.W, PS,
2357; EG-NEXT:     ADD_INT T0.W, T1.Z, PV.Y,
2358; EG-NEXT:     ADDC_UINT * T2.W, T1.Y, PV.X,
2359; EG-NEXT:     ADD_INT T1.Y, T1.Y, T5.X,
2360; EG-NEXT:     ADD_INT T1.Z, T1.W, T0.Y,
2361; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2362; EG-NEXT:     ADD_INT * T1.W, PV.Y, PV.Z,
2363; EG-NEXT:     ADD_INT T0.W, PV.W, PS,
2364; EG-NEXT:     ADDC_UINT * T1.W, PV.Y, PV.Z,
2365; EG-NEXT:     ADD_INT * T0.W, PV.W, PS,
2366; EG-NEXT:     ADD_INT * T0.Z, T1.Y, T1.Z,
2367; EG-NEXT:     ADD_INT * T0.Y, T2.Y, T4.W,
2368; EG-NEXT:     LSHR T1.X, T1.X, literal.x,
2369; EG-NEXT:     MULLO_INT * T0.X, T0.X, T2.X,
2370; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2371entry:
2372  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2373  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
2374  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
2375  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
2376  %a = load i128, ptr addrspace(1) %gep.a
2377  %b = load i128, ptr addrspace(1) %gep.b
2378  %mul = mul i128 %a, %b
2379  store i128 %mul, ptr addrspace(1) %gep.out
2380  ret void
2381}
2382
; Declaration of the work-item id intrinsic. The v_mul_i128 body above calls it
; to obtain %tid, which indexes the i128 getelementptr operands.
2383declare i32 @llvm.amdgcn.workitem.id.x() #1
2384
; Attribute groups referenced by number from declarations/definitions.
; NOTE(review): no use of #0 is visible in this part of the file; presumably it
; is attached to kernel definitions earlier on - confirm before removing it.
2385attributes #0 = { nounwind }
; #1 is the group attached to the intrinsic declaration above.
2386attributes #1 = { nounwind readnone}
2387