xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mul_int24.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
6; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
7
8; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
; Sign-extends the low 24 bits of both i32 kernel arguments (shl 8, then
; ashr 8), multiplies them as i32 and stores the 32-bit product to %out.
9define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
10; SI-LABEL: test_smul24_i32:
11; SI:       ; %bb.0: ; %entry
12; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
13; SI-NEXT:    s_mov_b32 s7, 0xf000
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_bfe_i32 s2, s2, 0x180000
16; SI-NEXT:    s_bfe_i32 s3, s3, 0x180000
17; SI-NEXT:    s_mul_i32 s2, s2, s3
18; SI-NEXT:    s_mov_b32 s6, -1
19; SI-NEXT:    s_mov_b32 s4, s0
20; SI-NEXT:    s_mov_b32 s5, s1
21; SI-NEXT:    v_mov_b32_e32 v0, s2
22; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
23; SI-NEXT:    s_endpgm
24;
25; VI-LABEL: test_smul24_i32:
26; VI:       ; %bb.0: ; %entry
27; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
28; VI-NEXT:    s_mov_b32 s7, 0xf000
29; VI-NEXT:    s_mov_b32 s6, -1
30; VI-NEXT:    s_waitcnt lgkmcnt(0)
31; VI-NEXT:    s_mov_b32 s4, s0
32; VI-NEXT:    s_mov_b32 s5, s1
33; VI-NEXT:    s_bfe_i32 s0, s2, 0x180000
34; VI-NEXT:    s_bfe_i32 s1, s3, 0x180000
35; VI-NEXT:    s_mul_i32 s0, s0, s1
36; VI-NEXT:    v_mov_b32_e32 v0, s0
37; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
38; VI-NEXT:    s_endpgm
39;
40; GFX9-LABEL: test_smul24_i32:
41; GFX9:       ; %bb.0: ; %entry
42; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
43; GFX9-NEXT:    s_mov_b32 s7, 0xf000
44; GFX9-NEXT:    s_mov_b32 s6, -1
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    s_mov_b32 s4, s0
47; GFX9-NEXT:    s_mov_b32 s5, s1
48; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
49; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
50; GFX9-NEXT:    s_mul_i32 s0, s0, s1
51; GFX9-NEXT:    v_mov_b32_e32 v0, s0
52; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
53; GFX9-NEXT:    s_endpgm
54;
55; EG-LABEL: test_smul24_i32:
56; EG:       ; %bb.0: ; %entry
57; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
58; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
59; EG-NEXT:    CF_END
60; EG-NEXT:    PAD
61; EG-NEXT:    ALU clause starting at 4:
62; EG-NEXT:     LSHL T0.W, KC0[2].Z, literal.x,
63; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.x,
64; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
65; EG-NEXT:     ASHR T1.W, PS, literal.x,
66; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
67; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
68; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
69; EG-NEXT:     MULLO_INT * T1.X, PS, PV.W,
70; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
71;
72; CM-LABEL: test_smul24_i32:
73; CM:       ; %bb.0: ; %entry
74; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
75; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
76; CM-NEXT:    CF_END
77; CM-NEXT:    PAD
78; CM-NEXT:    ALU clause starting at 4:
79; CM-NEXT:     LSHL T0.Z, KC0[2].Z, literal.x,
80; CM-NEXT:     LSHL * T0.W, KC0[2].W, literal.x,
81; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
82; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
83; CM-NEXT:     ASHR T1.Z, PV.W, literal.y,
84; CM-NEXT:     ASHR * T0.W, PV.Z, literal.y,
85; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
86; CM-NEXT:     MULLO_INT T1.X, T0.W, T1.Z,
87; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
88; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
89; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
90entry:
  ; shl 8 + ashr 8 replicates bit 23 of %a into bits 24..31.
91  %a.shl = shl i32 %a, 8
92  %a.24 = ashr i32 %a.shl, 8
  ; Same 24-bit sign extension for %b.
93  %b.shl = shl i32 %b, 8
94  %b.24 = ashr i32 %b.shl, 8
  ; Only the low 32 bits of the product are kept.
95  %mul24 = mul i32 %a.24, %b.24
96  store i32 %mul24, ptr addrspace(1) %out
97  ret void
98}
99
; Sign-extends the low 24 bits of both i32 kernel arguments, widens them to
; i64, multiplies, and stores only bits 32..63 of the product as an i32.
100define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
101; SI-LABEL: test_smulhi24_i64:
102; SI:       ; %bb.0: ; %entry
103; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
104; SI-NEXT:    s_mov_b32 s7, 0xf000
105; SI-NEXT:    s_mov_b32 s6, -1
106; SI-NEXT:    s_waitcnt lgkmcnt(0)
107; SI-NEXT:    s_mov_b32 s4, s0
108; SI-NEXT:    s_mov_b32 s5, s1
109; SI-NEXT:    v_mov_b32_e32 v0, s3
110; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
111; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
112; SI-NEXT:    s_endpgm
113;
114; VI-LABEL: test_smulhi24_i64:
115; VI:       ; %bb.0: ; %entry
116; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
117; VI-NEXT:    s_mov_b32 s7, 0xf000
118; VI-NEXT:    s_mov_b32 s6, -1
119; VI-NEXT:    s_waitcnt lgkmcnt(0)
120; VI-NEXT:    v_mov_b32_e32 v0, s3
121; VI-NEXT:    s_mov_b32 s4, s0
122; VI-NEXT:    s_mov_b32 s5, s1
123; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s2, v0
124; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
125; VI-NEXT:    s_endpgm
126;
127; GFX9-LABEL: test_smulhi24_i64:
128; GFX9:       ; %bb.0: ; %entry
129; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
130; GFX9-NEXT:    s_mov_b32 s7, 0xf000
131; GFX9-NEXT:    s_mov_b32 s6, -1
132; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
133; GFX9-NEXT:    s_mov_b32 s4, s0
134; GFX9-NEXT:    s_mov_b32 s5, s1
135; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
136; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
137; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
138; GFX9-NEXT:    v_mov_b32_e32 v0, s0
139; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
140; GFX9-NEXT:    s_endpgm
141;
142; EG-LABEL: test_smulhi24_i64:
143; EG:       ; %bb.0: ; %entry
144; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
145; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
146; EG-NEXT:    CF_END
147; EG-NEXT:    PAD
148; EG-NEXT:    ALU clause starting at 4:
149; EG-NEXT:     LSHL T0.W, KC0[2].Z, literal.x,
150; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.x,
151; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
152; EG-NEXT:     ASHR T1.W, PS, literal.x,
153; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
154; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
155; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
156; EG-NEXT:     MULHI_INT * T1.X, PS, PV.W,
157; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
158;
159; CM-LABEL: test_smulhi24_i64:
160; CM:       ; %bb.0: ; %entry
161; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
162; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
163; CM-NEXT:    CF_END
164; CM-NEXT:    PAD
165; CM-NEXT:    ALU clause starting at 4:
166; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
167; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
168; CM-NEXT:     MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
169; CM-NEXT:     MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
170; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
171; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
172entry:
  ; 24-bit sign extension of each operand within i32 (shl 8 / ashr 8).
173  %a.shl = shl i32 %a, 8
174  %a.24 = ashr i32 %a.shl, 8
175  %b.shl = shl i32 %b, 8
176  %b.24 = ashr i32 %b.shl, 8
  ; Widen to i64 so the multiply produces the full product.
177  %a.24.i64 = sext i32 %a.24 to i64
178  %b.24.i64 = sext i32 %b.24 to i64
179  %mul48 = mul i64 %a.24.i64, %b.24.i64
  ; Keep only the upper 32 bits of the i64 product.
180  %mul48.hi = lshr i64 %mul48, 32
181  %mul24hi = trunc i64 %mul48.hi to i32
182  store i32 %mul24hi, ptr addrspace(1) %out
183  ret void
184}
185
; Non-kernel function: sign-extends the low 24 bits of each i64 operand
; (shl 40, then ashr 40) and returns their full i64 product.
186define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
187; SI-LABEL: test_smul48_i64:
188; SI:       ; %bb.0:
189; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
191; SI-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
192; SI-NEXT:    s_setpc_b64 s[30:31]
193;
194; VI-LABEL: test_smul48_i64:
195; VI:       ; %bb.0:
196; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
198; VI-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
199; VI-NEXT:    s_setpc_b64 s[30:31]
200;
201; GFX9-LABEL: test_smul48_i64:
202; GFX9:       ; %bb.0:
203; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
205; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
206; GFX9-NEXT:    s_setpc_b64 s[30:31]
207;
208; EG-LABEL: test_smul48_i64:
209; EG:       ; %bb.0:
210; EG-NEXT:    CF_END
211; EG-NEXT:    PAD
212;
213; CM-LABEL: test_smul48_i64:
214; CM:       ; %bb.0:
215; CM-NEXT:    CF_END
216; CM-NEXT:    PAD
  ; shl 40 + ashr 40 on i64 leaves a value sign-extended from bit 23.
217  %shl.lhs = shl i64 %lhs, 40
218  %lhs24 = ashr i64 %shl.lhs, 40
219  %shl.rhs = shl i64 %rhs, 40
220  %rhs24 = ashr i64 %shl.rhs, 40
221  %mul = mul i64 %lhs24, %rhs24
222  ret i64 %mul
223}
224
; Two-element vector form of the 48-bit multiply: every i64 lane of both
; operands is sign-extended from its low 24 bits (shl 40, then ashr 40) and
; the lane-wise i64 products are returned.
225define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
226; SI-LABEL: test_smul48_v2i64:
227; SI:       ; %bb.0:
228; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v4
230; SI-NEXT:    v_mul_i32_i24_e32 v0, v0, v4
231; SI-NEXT:    v_mul_hi_i32_i24_e32 v3, v2, v6
232; SI-NEXT:    v_mul_i32_i24_e32 v2, v2, v6
233; SI-NEXT:    s_setpc_b64 s[30:31]
234;
235; VI-LABEL: test_smul48_v2i64:
236; VI:       ; %bb.0:
237; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v4
239; VI-NEXT:    v_mul_i32_i24_e32 v0, v0, v4
240; VI-NEXT:    v_mul_hi_i32_i24_e32 v3, v2, v6
241; VI-NEXT:    v_mul_i32_i24_e32 v2, v2, v6
242; VI-NEXT:    s_setpc_b64 s[30:31]
243;
244; GFX9-LABEL: test_smul48_v2i64:
245; GFX9:       ; %bb.0:
246; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v4
248; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v4
249; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v3, v2, v6
250; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v2, v6
251; GFX9-NEXT:    s_setpc_b64 s[30:31]
252;
253; EG-LABEL: test_smul48_v2i64:
254; EG:       ; %bb.0:
255; EG-NEXT:    CF_END
256; EG-NEXT:    PAD
257;
258; CM-LABEL: test_smul48_v2i64:
259; CM:       ; %bb.0:
260; CM-NEXT:    CF_END
261; CM-NEXT:    PAD
  ; Per-lane 24-bit sign extension of both operands.
262  %shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
263  %lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
264  %shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
265  %rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
266  %mul = mul <2 x i64> %lhs24, %rhs24
267  ret <2 x i64> %mul
268}
269
270; This requires handling the original 64-bit mul node in order to eliminate
271; the unnecessary extension instructions: after legalization they will not
272; be removed by SimplifyDemandedBits, because they have multiple uses (the
273; separate mul and mulhi).
; Sign-extends the low 24 bits of i32 arguments %a and %b, widens both to
; i64 and stores the full 64-bit product to %out.
; NOTE(review): the unnamed [8 x i32] parameters look like padding that keeps
; %out, %a and %b apart in the kernel argument list - confirm intent.
274define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
275; SI-LABEL: test_smul24_i64:
276; SI:       ; %bb.0:
277; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
278; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
279; SI-NEXT:    s_load_dword s4, s[4:5], 0x1c
280; SI-NEXT:    s_mov_b32 s3, 0xf000
281; SI-NEXT:    s_mov_b32 s2, -1
282; SI-NEXT:    s_waitcnt lgkmcnt(0)
283; SI-NEXT:    s_bfe_i32 s5, s6, 0x180000
284; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
285; SI-NEXT:    v_mov_b32_e32 v0, s5
286; SI-NEXT:    s_mul_i32 s5, s4, s5
287; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s4, v0
288; SI-NEXT:    v_mov_b32_e32 v0, s5
289; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
290; SI-NEXT:    s_endpgm
291;
292; VI-LABEL: test_smul24_i64:
293; VI:       ; %bb.0:
294; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
295; VI-NEXT:    s_load_dword s7, s[4:5], 0x70
296; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
297; VI-NEXT:    s_mov_b32 s3, 0xf000
298; VI-NEXT:    s_mov_b32 s2, -1
299; VI-NEXT:    s_waitcnt lgkmcnt(0)
300; VI-NEXT:    s_bfe_i32 s4, s6, 0x180000
301; VI-NEXT:    s_bfe_i32 s5, s7, 0x180000
302; VI-NEXT:    v_mov_b32_e32 v0, s4
303; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s5, v0
304; VI-NEXT:    v_mul_i32_i24_e32 v0, s5, v0
305; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
306; VI-NEXT:    s_endpgm
307;
308; GFX9-LABEL: test_smul24_i64:
309; GFX9:       ; %bb.0:
310; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x4c
311; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x70
312; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
313; GFX9-NEXT:    s_mov_b32 s3, 0xf000
314; GFX9-NEXT:    s_mov_b32 s2, -1
315; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
316; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
317; GFX9-NEXT:    s_bfe_i32 s5, s7, 0x180000
318; GFX9-NEXT:    s_mul_hi_i32 s6, s5, s4
319; GFX9-NEXT:    s_mul_i32 s5, s5, s4
320; GFX9-NEXT:    v_mov_b32_e32 v0, s5
321; GFX9-NEXT:    v_mov_b32_e32 v1, s6
322; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
323; GFX9-NEXT:    s_endpgm
324;
325; EG-LABEL: test_smul24_i64:
326; EG:       ; %bb.0:
327; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
328; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
329; EG-NEXT:    CF_END
330; EG-NEXT:    PAD
331; EG-NEXT:    ALU clause starting at 4:
332; EG-NEXT:     LSHL T0.W, KC0[4].Z, literal.x,
333; EG-NEXT:     LSHL * T1.W, KC0[6].W, literal.x,
334; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
335; EG-NEXT:     ASHR T1.W, PS, literal.x,
336; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
337; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
338; EG-NEXT:     MULHI_INT * T0.Y, PV.W, PS,
339; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
340; EG-NEXT:     MULLO_INT * T0.X, T1.W, T0.W,
341; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
342;
343; CM-LABEL: test_smul24_i64:
344; CM:       ; %bb.0:
345; CM-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
346; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
347; CM-NEXT:    CF_END
348; CM-NEXT:    PAD
349; CM-NEXT:    ALU clause starting at 4:
350; CM-NEXT:     LSHL T0.Z, KC0[4].Z, literal.x,
351; CM-NEXT:     LSHL * T0.W, KC0[6].W, literal.x,
352; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
353; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
354; CM-NEXT:     ASHR T1.Z, PV.W, literal.y,
355; CM-NEXT:     ASHR * T0.W, PV.Z, literal.y,
356; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
357; CM-NEXT:     MULLO_INT T1.X, T1.Z, T0.W,
358; CM-NEXT:     MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
359; CM-NEXT:     MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
360; CM-NEXT:     MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
361; CM-NEXT:     MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
362; CM-NEXT:     MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
363; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
364; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
  ; %a: sign-extend from 24 bits inside i32, then widen to i64.
365  %shl.i = shl i32 %a, 8
366  %shr.i = ashr i32 %shl.i, 8
367  %conv.i = sext i32 %shr.i to i64
  ; %b: same treatment.
368  %shl1.i = shl i32 %b, 8
369  %shr2.i = ashr i32 %shl1.i, 8
370  %conv3.i = sext i32 %shr2.i to i64
  ; The whole i64 product is stored, so both halves of the result are used.
371  %mul.i = mul i64 %conv3.i, %conv.i
372  store i64 %mul.i, ptr addrspace(1) %out
373  ret void
374}
375
; Squares a 24-bit signed value: %a is sign-extended from its low 24 bits,
; widened to i64 and multiplied by itself; the i64 product is stored to %out.
; The %b argument is declared but never used by the body.
375define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
376; SI-LABEL: test_smul24_i64_square:
377; SI:       ; %bb.0:
378; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
379; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
380; SI-NEXT:    s_mov_b32 s3, 0xf000
381; SI-NEXT:    s_mov_b32 s2, -1
382; SI-NEXT:    s_waitcnt lgkmcnt(0)
383; SI-NEXT:    s_bfe_i32 s4, s6, 0x180000
384; SI-NEXT:    s_mul_i32 s5, s4, s4
385; SI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
386; SI-NEXT:    v_mov_b32_e32 v0, s5
387; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
388; SI-NEXT:    s_endpgm
389;
390; VI-LABEL: test_smul24_i64_square:
391; VI:       ; %bb.0:
392; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
393; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
394; VI-NEXT:    s_mov_b32 s3, 0xf000
395; VI-NEXT:    s_mov_b32 s2, -1
396; VI-NEXT:    s_waitcnt lgkmcnt(0)
397; VI-NEXT:    s_bfe_i32 s4, s6, 0x180000
398; VI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
399; VI-NEXT:    v_mul_i32_i24_e64 v0, s4, s4
400; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
401; VI-NEXT:    s_endpgm
402;
403; GFX9-LABEL: test_smul24_i64_square:
404; GFX9:       ; %bb.0:
405; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
406; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
407; GFX9-NEXT:    s_mov_b32 s3, 0xf000
408; GFX9-NEXT:    s_mov_b32 s2, -1
409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX9-NEXT:    s_bfe_i32 s4, s6, 0x180000
411; GFX9-NEXT:    s_mul_hi_i32 s5, s4, s4
412; GFX9-NEXT:    s_mul_i32 s4, s4, s4
413; GFX9-NEXT:    v_mov_b32_e32 v0, s4
414; GFX9-NEXT:    v_mov_b32_e32 v1, s5
415; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
416; GFX9-NEXT:    s_endpgm
417;
418; EG-LABEL: test_smul24_i64_square:
419; EG:       ; %bb.0:
420; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
421; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
422; EG-NEXT:    CF_END
423; EG-NEXT:    PAD
424; EG-NEXT:    ALU clause starting at 4:
425; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
426; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
427; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
428; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
429; EG-NEXT:     MULHI_INT * T0.Y, PV.W, PV.W,
430; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
431; EG-NEXT:     MULLO_INT * T0.X, T0.W, T0.W,
432; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
433;
434; CM-LABEL: test_smul24_i64_square:
435; CM:       ; %bb.0:
436; CM-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
437; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
438; CM-NEXT:    CF_END
439; CM-NEXT:    PAD
440; CM-NEXT:    ALU clause starting at 4:
441; CM-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
442; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
443; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
444; CM-NEXT:     ASHR * T0.W, PV.W, literal.y,
445; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
446; CM-NEXT:     MULLO_INT T1.X, T0.W, T0.W,
447; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T0.W,
448; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T0.W,
449; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T0.W,
450; CM-NEXT:     MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
451; CM-NEXT:     MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
452; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
453; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
  ; Sign-extend %a from 24 bits and widen to i64.
454  %shl.i = shl i32 %a, 8
455  %shr.i = ashr i32 %shl.i, 8
456  %conv.i = sext i32 %shr.i to i64
  ; Both multiply operands are the same value.
457  %mul.i = mul i64 %conv.i, %conv.i
458  store i64 %mul.i, ptr addrspace(1) %out
459  ret void
460}
462
; Same 24-bit signed multiply pattern on the non-power-of-two type i33: each
; operand is sign-extended from its low 24 bits (shl 9, then ashr 9), the i33
; product is sign-extended to i64 and stored to %out.
463define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) #0 {
464; SI-LABEL: test_smul24_i33:
465; SI:       ; %bb.0: ; %entry
466; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
467; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
468; SI-NEXT:    s_load_dword s4, s[4:5], 0xd
469; SI-NEXT:    s_mov_b32 s3, 0xf000
470; SI-NEXT:    s_mov_b32 s2, -1
471; SI-NEXT:    s_waitcnt lgkmcnt(0)
472; SI-NEXT:    s_lshl_b32 s5, s6, 8
473; SI-NEXT:    s_lshl_b32 s7, s4, 8
474; SI-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
475; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
476; SI-NEXT:    v_mov_b32_e32 v0, s6
477; SI-NEXT:    s_mul_i32 s5, s4, s6
478; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s4, v0
479; SI-NEXT:    v_mov_b32_e32 v0, s5
480; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
481; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
482; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
483; SI-NEXT:    s_endpgm
484;
485; VI-LABEL: test_smul24_i33:
486; VI:       ; %bb.0: ; %entry
487; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
488; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
489; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
490; VI-NEXT:    s_waitcnt lgkmcnt(0)
491; VI-NEXT:    s_lshl_b32 s3, s2, 8
492; VI-NEXT:    s_lshl_b32 s5, s6, 8
493; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
494; VI-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
495; VI-NEXT:    v_mov_b32_e32 v0, s4
496; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s2, v0
497; VI-NEXT:    v_mul_i32_i24_e32 v0, s2, v0
498; VI-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
499; VI-NEXT:    s_mov_b32 s3, 0xf000
500; VI-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
501; VI-NEXT:    s_mov_b32 s2, -1
502; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
503; VI-NEXT:    s_endpgm
504;
505; GFX9-LABEL: test_smul24_i33:
506; GFX9:       ; %bb.0: ; %entry
507; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
508; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
509; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x34
510; GFX9-NEXT:    s_mov_b32 s3, 0xf000
511; GFX9-NEXT:    s_mov_b32 s2, -1
512; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
513; GFX9-NEXT:    s_lshl_b32 s5, s6, 8
514; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
515; GFX9-NEXT:    s_lshl_b32 s5, s7, 8
516; GFX9-NEXT:    s_ashr_i64 s[6:7], s[4:5], 40
517; GFX9-NEXT:    s_mul_hi_i32 s5, s4, s6
518; GFX9-NEXT:    s_mul_i32 s4, s4, s6
519; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], 31
520; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 31
521; GFX9-NEXT:    v_mov_b32_e32 v0, s4
522; GFX9-NEXT:    v_mov_b32_e32 v1, s5
523; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
524; GFX9-NEXT:    s_endpgm
525;
526; EG-LABEL: test_smul24_i33:
527; EG:       ; %bb.0: ; %entry
528; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
529; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
530; EG-NEXT:    CF_END
531; EG-NEXT:    PAD
532; EG-NEXT:    ALU clause starting at 4:
533; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
534; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
535; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
536; EG-NEXT:     ASHR T1.W, PS, literal.x,
537; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
538; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
539; EG-NEXT:     MULHI_INT * T0.X, PS, PV.W,
540; EG-NEXT:     MULLO_INT * T1.X, T0.W, T1.W,
541; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
542; EG-NEXT:     BFE_INT * T1.Y, T0.X, 0.0, 1,
543; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
544;
545; CM-LABEL: test_smul24_i33:
546; CM:       ; %bb.0: ; %entry
547; CM-NEXT:    ALU 16, @4, KC0[CB0:0-32], KC1[]
548; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
549; CM-NEXT:    CF_END
550; CM-NEXT:    PAD
551; CM-NEXT:    ALU clause starting at 4:
552; CM-NEXT:     LSHL T0.Z, KC0[2].W, literal.x,
553; CM-NEXT:     LSHL * T0.W, KC0[3].Y, literal.x,
554; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
555; CM-NEXT:     ASHR T1.Z, PV.W, literal.x,
556; CM-NEXT:     ASHR * T0.W, PV.Z, literal.x,
557; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
558; CM-NEXT:     MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
559; CM-NEXT:     MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
560; CM-NEXT:     MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
561; CM-NEXT:     MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
562; CM-NEXT:     MULLO_INT T1.X, T0.W, T1.Z,
563; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
564; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
565; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
566; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
567; CM-NEXT:     BFE_INT * T1.Y, T0.X, 0.0, 1,
568; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
569entry:
  ; i33 is 33 bits wide, so shl 9 + ashr 9 sign-extends from bit 23.
570  %a.shl = shl i33 %a, 9
571  %a.24 = ashr i33 %a.shl, 9
572  %b.shl = shl i33 %b, 9
573  %b.24 = ashr i33 %b.shl, 9
574  %mul24 = mul i33 %a.24, %b.24
  ; Sign-extend the 33-bit product to i64 for the store.
575  %ext = sext i33 %mul24 to i64
576  store i64 %ext, ptr addrspace(1) %out
577  ret void
578}
579
; High-part variant on i33: each operand is sign-extended from its low 24
; bits (shl 9, then ashr 9), the i33 product is shifted right by 32 and the
; result is truncated to i32 and stored. Because an i33 has only one bit at
; or above position 32, the stored value is either 0 or 1.
579define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33 %b) {
580; SI-LABEL: test_smulhi24_i33:
581; SI:       ; %bb.0: ; %entry
582; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
583; SI-NEXT:    s_load_dword s7, s[4:5], 0xb
584; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
585; SI-NEXT:    s_mov_b32 s3, 0xf000
586; SI-NEXT:    s_mov_b32 s2, -1
587; SI-NEXT:    s_waitcnt lgkmcnt(0)
588; SI-NEXT:    v_mov_b32_e32 v0, s6
589; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s7, v0
590; SI-NEXT:    v_and_b32_e32 v0, 1, v0
591; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
592; SI-NEXT:    s_endpgm
593;
594; VI-LABEL: test_smulhi24_i33:
595; VI:       ; %bb.0: ; %entry
596; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
597; VI-NEXT:    s_load_dword s7, s[4:5], 0x2c
598; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
599; VI-NEXT:    s_mov_b32 s3, 0xf000
600; VI-NEXT:    s_mov_b32 s2, -1
601; VI-NEXT:    s_waitcnt lgkmcnt(0)
602; VI-NEXT:    v_mov_b32_e32 v0, s6
603; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s7, v0
604; VI-NEXT:    v_and_b32_e32 v0, 1, v0
605; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
606; VI-NEXT:    s_endpgm
607;
608; GFX9-LABEL: test_smulhi24_i33:
609; GFX9:       ; %bb.0: ; %entry
610; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
611; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
612; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x34
613; GFX9-NEXT:    s_mov_b32 s3, 0xf000
614; GFX9-NEXT:    s_mov_b32 s2, -1
615; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
616; GFX9-NEXT:    s_lshl_b32 s5, s6, 8
617; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
618; GFX9-NEXT:    s_lshl_b32 s5, s7, 8
619; GFX9-NEXT:    s_ashr_i64 s[6:7], s[4:5], 40
620; GFX9-NEXT:    s_mul_hi_i32 s4, s4, s6
621; GFX9-NEXT:    s_and_b32 s4, s4, 1
622; GFX9-NEXT:    v_mov_b32_e32 v0, s4
623; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
624; GFX9-NEXT:    s_endpgm
625;
626; EG-LABEL: test_smulhi24_i33:
627; EG:       ; %bb.0: ; %entry
628; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
629; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
630; EG-NEXT:    CF_END
631; EG-NEXT:    PAD
632; EG-NEXT:    ALU clause starting at 4:
633; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
634; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
635; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
636; EG-NEXT:     ASHR T1.W, PS, literal.x,
637; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
638; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
639; EG-NEXT:     MULHI_INT * T0.X, PS, PV.W,
640; EG-NEXT:     AND_INT T0.X, PS, 1,
641; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
642; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
643;
644; CM-LABEL: test_smulhi24_i33:
645; CM:       ; %bb.0: ; %entry
646; CM-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
647; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
648; CM-NEXT:    CF_END
649; CM-NEXT:    PAD
650; CM-NEXT:    ALU clause starting at 4:
651; CM-NEXT:     MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
652; CM-NEXT:     MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
653; CM-NEXT:     MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
654; CM-NEXT:     MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
655; CM-NEXT:     AND_INT * T0.X, PV.X, 1,
656; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
657; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
658entry:
  ; Sign-extend both i33 operands from 24 bits.
659  %tmp0 = shl i33 %a, 9
660  %a_24 = ashr i33 %tmp0, 9
661  %tmp1 = shl i33 %b, 9
662  %b_24 = ashr i33 %tmp1, 9
663  %tmp2 = mul i33 %a_24, %b_24
  ; Logical shift by 32 leaves only bit 32 of the i33 product.
664  %hi = lshr i33 %tmp2, 32
665  %trunc = trunc i33 %hi to i32
666
667  store i32 %trunc, ptr addrspace(1) %out
668  ret void
669}
671
; Branches on %arg0 == 0. When the compare is true, bb11 splats element 0 of
; %arg1 and of %arg2, sign-extends every lane from its low 24 bits (shl 8,
; then ashr 8), multiplies the vectors and stores the <2 x i32> product to
; %out; otherwise the kernel returns without storing.
; NOTE(review): the name suggests this is a regression test for a compiler
; crash while simplifying 24-bit multiply operands - confirm against history.
671define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
672; SI-LABEL: simplify_i24_crash:
673; SI:       ; %bb.0: ; %bb
674; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
675; SI-NEXT:    s_waitcnt lgkmcnt(0)
676; SI-NEXT:    s_cmp_lg_u32 s0, 0
677; SI-NEXT:    s_cbranch_scc0 .LBB8_2
678; SI-NEXT:  ; %bb.1: ; %bb7
679; SI-NEXT:    s_endpgm
680; SI-NEXT:  .LBB8_2: ; %bb11
681; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
682; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
683; SI-NEXT:    s_mov_b32 s7, 0xf000
684; SI-NEXT:    s_waitcnt lgkmcnt(0)
685; SI-NEXT:    s_bfe_i32 s0, s0, 0x180000
686; SI-NEXT:    s_bfe_i32 s1, s2, 0x180000
687; SI-NEXT:    s_mul_i32 s0, s0, s1
688; SI-NEXT:    s_mov_b32 s6, -1
689; SI-NEXT:    v_mov_b32_e32 v0, s0
690; SI-NEXT:    v_mov_b32_e32 v1, s0
691; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
692; SI-NEXT:    s_endpgm
693;
694; VI-LABEL: simplify_i24_crash:
695; VI:       ; %bb.0: ; %bb
696; VI-NEXT:    s_load_dword s0, s[4:5], 0x2c
697; VI-NEXT:    s_waitcnt lgkmcnt(0)
698; VI-NEXT:    s_cmp_lg_u32 s0, 0
699; VI-NEXT:    s_cbranch_scc0 .LBB8_2
700; VI-NEXT:  ; %bb.1: ; %bb7
701; VI-NEXT:    s_endpgm
702; VI-NEXT:  .LBB8_2: ; %bb11
703; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
704; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
705; VI-NEXT:    s_mov_b32 s7, 0xf000
706; VI-NEXT:    s_mov_b32 s6, -1
707; VI-NEXT:    s_waitcnt lgkmcnt(0)
708; VI-NEXT:    s_bfe_i32 s0, s0, 0x180000
709; VI-NEXT:    s_bfe_i32 s1, s2, 0x180000
710; VI-NEXT:    s_mul_i32 s0, s0, s1
711; VI-NEXT:    v_mov_b32_e32 v0, s0
712; VI-NEXT:    v_mov_b32_e32 v1, s0
713; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
714; VI-NEXT:    s_endpgm
715;
716; GFX9-LABEL: simplify_i24_crash:
717; GFX9:       ; %bb.0: ; %bb
718; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x2c
719; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
720; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
721; GFX9-NEXT:    s_cbranch_scc0 .LBB8_2
722; GFX9-NEXT:  ; %bb.1: ; %bb7
723; GFX9-NEXT:    s_endpgm
724; GFX9-NEXT:  .LBB8_2: ; %bb11
725; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
726; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
727; GFX9-NEXT:    s_mov_b32 s11, 0xf000
728; GFX9-NEXT:    s_mov_b32 s10, -1
729; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x180000
731; GFX9-NEXT:    s_bfe_i32 s1, s2, 0x180000
732; GFX9-NEXT:    s_mul_i32 s0, s0, s1
733; GFX9-NEXT:    v_mov_b32_e32 v0, s0
734; GFX9-NEXT:    v_mov_b32_e32 v1, s0
735; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
736; GFX9-NEXT:    s_endpgm
737;
738; EG-LABEL: simplify_i24_crash:
739; EG:       ; %bb.0: ; %bb
740; EG-NEXT:    ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
741; EG-NEXT:    JUMP @5 POP:1
742; EG-NEXT:    ALU 14, @8, KC0[CB0:0-32], KC1[]
743; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
744; EG-NEXT:    POP @5 POP:1
745; EG-NEXT:    CF_END
746; EG-NEXT:    ALU clause starting at 6:
747; EG-NEXT:     SETNE_INT * T0.W, KC0[2].Z, 0.0,
748; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
749; EG-NEXT:    ALU clause starting at 8:
750; EG-NEXT:     MOV T0.X, KC0[3].Y,
751; EG-NEXT:     MOV * T1.X, KC0[2].W,
752; EG-NEXT:     LSHL T0.W, PS, literal.x,
753; EG-NEXT:     LSHL * T1.W, PV.X, literal.x,
754; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
755; EG-NEXT:     ASHR T1.W, PS, literal.x,
756; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
757; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
758; EG-NEXT:     MOV T2.W, KC0[2].Y,
759; EG-NEXT:     MULLO_INT * T0.X, PS, PV.W,
760; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
761; EG-NEXT:     MOV T0.Y, PS,
762; EG-NEXT:     MOV T0.W, KC0[3].X,
763; EG-NEXT:     MOV * T0.W, KC0[3].Z,
764; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
765;
766; CM-LABEL: simplify_i24_crash:
767; CM:       ; %bb.0: ; %bb
768; CM-NEXT:    ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
769; CM-NEXT:    JUMP @5 POP:1
770; CM-NEXT:    ALU 17, @8, KC0[CB0:0-32], KC1[]
771; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
772; CM-NEXT:    POP @5 POP:1
773; CM-NEXT:    CF_END
774; CM-NEXT:    ALU clause starting at 6:
775; CM-NEXT:     SETNE_INT * T0.W, KC0[2].Z, 0.0,
776; CM-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
777; CM-NEXT:    ALU clause starting at 8:
778; CM-NEXT:     MOV * T0.X, KC0[3].Y,
779; CM-NEXT:     MOV * T1.X, KC0[2].W,
780; CM-NEXT:     LSHL T0.Z, PV.X, literal.x,
781; CM-NEXT:     LSHL * T0.W, T0.X, literal.x,
782; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
783; CM-NEXT:     MOV T0.Y, KC0[2].Y,
784; CM-NEXT:     ASHR T1.Z, PV.W, literal.x,
785; CM-NEXT:     ASHR * T0.W, PV.Z, literal.x,
786; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
787; CM-NEXT:     MULLO_INT T0.X, T0.W, T1.Z,
788; CM-NEXT:     MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
789; CM-NEXT:     MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
790; CM-NEXT:     MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
791; CM-NEXT:     LSHR T1.X, T0.Y, literal.x,
792; CM-NEXT:     MOV T0.Y, PV.X,
793; CM-NEXT:     MOV T0.Z, KC0[3].X,
794; CM-NEXT:     MOV * T0.W, KC0[3].Z,
795; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
796bb:
  ; Only the %arg0 == 0 path performs the multiply and store.
797  %cmp = icmp eq i32 %arg0, 0
798  br i1 %cmp, label %bb11, label %bb7
799
800bb11:
  ; zeroinitializer shuffle mask: both result lanes take element 0.
801  %tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
802  %tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
  ; Per-lane 24-bit sign extension of both splatted vectors.
803  %tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
804  %tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
805  %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
806  %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
807  %tmp21 = mul <2 x i32> %tmp18, %tmp20
808  store <2 x i32> %tmp21, ptr addrspace(1) %out
809  br label %bb7
810
811bb7:
812  ret void
813
814}
816
; Unsigned multiply of (%arg >> 9), which has at most 23 significant bits, by
; the constant 4286595041 (0xff803fe1). The product is formed in i64, shifted
; right by 1 and truncated back to i32.
; The constant occupies all 32 bits, and the autogenerated checks below expect
; a full 32-bit high/low multiply (or a 64-bit mad) whose halves are combined
; with an alignbit / BIT_ALIGN_INT by 1, rather than a 24-bit multiply.
; Note that %out is unused: the result is stored to a null addrspace(1) pointer.
817define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
818; SI-LABEL: test_umul_i24:
819; SI:       ; %bb.0:
820; SI-NEXT:    s_load_dword s1, s[4:5], 0xb
821; SI-NEXT:    v_mov_b32_e32 v0, 0xff803fe1
822; SI-NEXT:    s_mov_b32 s0, 0
823; SI-NEXT:    s_mov_b32 s3, 0xf000
824; SI-NEXT:    s_waitcnt lgkmcnt(0)
825; SI-NEXT:    s_lshr_b32 s1, s1, 9
826; SI-NEXT:    v_mul_hi_u32 v0, s1, v0
827; SI-NEXT:    s_mul_i32 s1, s1, 0xff803fe1
828; SI-NEXT:    v_alignbit_b32 v0, v0, s1, 1
829; SI-NEXT:    s_mov_b32 s2, -1
830; SI-NEXT:    s_mov_b32 s1, s0
831; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
832; SI-NEXT:    s_endpgm
833;
834; VI-LABEL: test_umul_i24:
835; VI:       ; %bb.0:
836; VI-NEXT:    s_load_dword s0, s[4:5], 0x2c
837; VI-NEXT:    v_mov_b32_e32 v0, 0xff803fe1
838; VI-NEXT:    s_mov_b32 s3, 0xf000
839; VI-NEXT:    s_mov_b32 s2, -1
840; VI-NEXT:    s_waitcnt lgkmcnt(0)
841; VI-NEXT:    s_lshr_b32 s0, s0, 9
842; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0
843; VI-NEXT:    s_mov_b32 s0, 0
844; VI-NEXT:    s_mov_b32 s1, s0
845; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 1
846; VI-NEXT:    s_nop 1
847; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
848; VI-NEXT:    s_endpgm
849;
850; GFX9-LABEL: test_umul_i24:
851; GFX9:       ; %bb.0:
852; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x2c
853; GFX9-NEXT:    s_mov_b32 s0, 0
854; GFX9-NEXT:    s_mov_b32 s3, 0xf000
855; GFX9-NEXT:    s_mov_b32 s2, -1
856; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX9-NEXT:    s_lshr_b32 s1, s1, 9
858; GFX9-NEXT:    s_mul_hi_u32 s4, s1, 0xff803fe1
859; GFX9-NEXT:    s_mul_i32 s1, s1, 0xff803fe1
860; GFX9-NEXT:    v_mov_b32_e32 v0, s1
861; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
862; GFX9-NEXT:    s_mov_b32 s1, s0
863; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
864; GFX9-NEXT:    s_endpgm
865;
866; EG-LABEL: test_umul_i24:
867; EG:       ; %bb.0:
868; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
869; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
870; EG-NEXT:    CF_END
871; EG-NEXT:    PAD
872; EG-NEXT:    ALU clause starting at 4:
873; EG-NEXT:     LSHR * T0.W, KC0[2].Z, literal.x,
874; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
875; EG-NEXT:     MULHI * T0.X, PV.W, literal.x,
876; EG-NEXT:    -8372255(nan), 0(0.000000e+00)
877; EG-NEXT:     MULLO_INT * T0.Y, T0.W, literal.x,
878; EG-NEXT:    -8372255(nan), 0(0.000000e+00)
879; EG-NEXT:     BIT_ALIGN_INT T0.X, T0.X, PS, 1,
880; EG-NEXT:     MOV * T1.X, literal.x,
881; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
882;
883; CM-LABEL: test_umul_i24:
884; CM:       ; %bb.0:
885; CM-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
886; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
887; CM-NEXT:    CF_END
888; CM-NEXT:    PAD
889; CM-NEXT:    ALU clause starting at 4:
890; CM-NEXT:     LSHR * T0.W, KC0[2].Z, literal.x,
891; CM-NEXT:    9(1.261169e-44), 0(0.000000e+00)
892; CM-NEXT:     MULHI T0.X, T0.W, literal.x,
893; CM-NEXT:     MULHI T0.Y (MASKED), T0.W, literal.x,
894; CM-NEXT:     MULHI T0.Z (MASKED), T0.W, literal.x,
895; CM-NEXT:     MULHI * T0.W (MASKED), T0.W, literal.x,
896; CM-NEXT:    -8372255(nan), 0(0.000000e+00)
897; CM-NEXT:     MULLO_INT T0.X (MASKED), T0.W, literal.x,
898; CM-NEXT:     MULLO_INT T0.Y, T0.W, literal.x,
899; CM-NEXT:     MULLO_INT T0.Z (MASKED), T0.W, literal.x,
900; CM-NEXT:     MULLO_INT * T0.W (MASKED), T0.W, literal.x,
901; CM-NEXT:    -8372255(nan), 0(0.000000e+00)
902; CM-NEXT:     BIT_ALIGN_INT * T0.X, T0.X, PV.Y, 1,
903; CM-NEXT:     MOV * T1.X, literal.x,
904; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
905  %i = lshr i32 %arg, 9              ; clears the top 9 bits of the operand
906  %i1 = zext i32 %i to i64
907  %i2 = mul i64 %i1, 4286595041      ; 0xff803fe1, i.e. -8372255 when read as a signed i32
908  %i3 = lshr i64 %i2, 1              ; pulls bit 32 of the product into the low word
909  %i4 = trunc i64 %i3 to i32
910  store i32 %i4, ptr addrspace(1) null, align 4  ; destination is null, not %out
911  ret void
912}
913
; Attribute group #0: any function declared with #0 is marked nounwind.
914attributes #0 = { nounwind }
915