xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll (revision 2d6d723a85c2d007b0359c206d66cd2e5a9f00e1)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
4; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
5
; Test function that interleaves three raw buffer loads with two image sample
; operations and then consumes all five results, so the generated code has to
; wait on both kinds of memory instruction. In the autogenerated checks below,
; the GFX11 run waits on all five results with s_waitcnt vmcnt(N), while both
; GFX12 runs (SelectionDAG and GlobalISel) use separate s_wait_loadcnt waits
; before the buffer-load compares and s_wait_samplecnt waits before the
; image-sample compares.
6define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 {
7; GFX11-LABEL: mixed_vmem_types:
8; GFX11:       ; %bb.0: ; %.entry
9; GFX11-NEXT:    s_getpc_b64 s[4:5]
10; GFX11-NEXT:    s_mov_b32 s0, s3
11; GFX11-NEXT:    s_mov_b32 s3, s5
12; GFX11-NEXT:    s_mov_b32 s1, s5
13; GFX11-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
14; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
15; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
16; GFX11-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
17; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
19; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
20; GFX11-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
21; GFX11-NEXT:    buffer_load_b32 v4, off, s[40:43], 0
22; GFX11-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
23; GFX11-NEXT:    s_waitcnt vmcnt(4)
24; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
25; GFX11-NEXT:    s_waitcnt vmcnt(3)
26; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
27; GFX11-NEXT:    s_waitcnt vmcnt(2)
28; GFX11-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
29; GFX11-NEXT:    s_waitcnt vmcnt(1)
30; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
31; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
32; GFX11-NEXT:    s_waitcnt vmcnt(0)
33; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
34; GFX11-NEXT:    s_and_b32 s0, s0, s1
35; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
36; GFX11-NEXT:    s_and_b32 s0, s0, s2
37; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
38; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
39; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
40; GFX11-NEXT:    buffer_store_b32 v0, off, s[24:27], 0
41; GFX11-NEXT:    s_endpgm
42;
43; GFX12-LABEL: mixed_vmem_types:
44; GFX12:       ; %bb.0: ; %.entry
45; GFX12-NEXT:    s_getpc_b64 s[4:5]
46; GFX12-NEXT:    s_mov_b32 s0, s3
47; GFX12-NEXT:    s_sext_i32_i16 s5, s5
48; GFX12-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
49; GFX12-NEXT:    s_mov_b32 s3, s5
50; GFX12-NEXT:    s_mov_b32 s1, s5
51; GFX12-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
52; GFX12-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
53; GFX12-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
54; GFX12-NEXT:    s_wait_kmcnt 0x0
55; GFX12-NEXT:    buffer_load_b32 v1, off, s[20:23], null
56; GFX12-NEXT:    buffer_load_b32 v2, off, s[16:19], null
57; GFX12-NEXT:    image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
58; GFX12-NEXT:    buffer_load_b32 v4, off, s[40:43], null
59; GFX12-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
60; GFX12-NEXT:    s_wait_loadcnt 0x2
61; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1
62; GFX12-NEXT:    s_wait_loadcnt 0x1
63; GFX12-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
64; GFX12-NEXT:    s_wait_samplecnt 0x1
65; GFX12-NEXT:    v_cmp_eq_f32_e64 s1, 1.0, v3
66; GFX12-NEXT:    s_wait_loadcnt 0x0
67; GFX12-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
68; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
69; GFX12-NEXT:    s_wait_samplecnt 0x0
70; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
71; GFX12-NEXT:    s_and_b32 s0, s0, s1
72; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
73; GFX12-NEXT:    s_and_b32 s0, s0, s2
74; GFX12-NEXT:    s_and_b32 s0, s0, vcc_lo
75; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
76; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
77; GFX12-NEXT:    buffer_store_b32 v0, off, s[24:27], null
78; GFX12-NEXT:    s_endpgm
79;
80; GFX12-GISEL-LABEL: mixed_vmem_types:
81; GFX12-GISEL:       ; %bb.0: ; %.entry
82; GFX12-GISEL-NEXT:    s_getpc_b64 s[20:21]
83; GFX12-GISEL-NEXT:    s_mov_b32 s0, s3
84; GFX12-GISEL-NEXT:    s_sext_i32_i16 s21, s21
85; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0xbc00bc00
86; GFX12-GISEL-NEXT:    s_mov_b32 s1, s21
87; GFX12-GISEL-NEXT:    s_mov_b32 s3, s21
88; GFX12-GISEL-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
89; GFX12-GISEL-NEXT:    s_clause 0x1
90; GFX12-GISEL-NEXT:    s_load_b256 s[20:27], s[2:3], 0x40
91; GFX12-GISEL-NEXT:    s_load_b512 s[36:51], s[2:3], 0x0
92; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
93; GFX12-GISEL-NEXT:    image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
94; GFX12-GISEL-NEXT:    buffer_load_b32 v2, off, s[16:19], null
95; GFX12-GISEL-NEXT:    buffer_load_b32 v3, off, s[20:23], null
96; GFX12-GISEL-NEXT:    buffer_load_b32 v4, off, s[40:43], null
97; GFX12-GISEL-NEXT:    image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
98; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x2
99; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0xac0, v2
100; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x1
101; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 1.0, v1
102; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x1
103; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s1, 0xac0, v3
104; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
105; GFX12-GISEL-NEXT:    v_cmp_eq_u32_e64 s2, 0xac0, v4
106; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
107; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
108; GFX12-GISEL-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
109; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s1
110; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
111; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, s2
112; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, vcc_lo
113; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
114; GFX12-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
115; GFX12-GISEL-NEXT:    buffer_store_b32 v0, off, s[24:27], null
116; GFX12-GISEL-NEXT:    s_endpgm
117.entry:
  ; Take bits 63:32 of the current program counter; this value is reused as
  ; element 1 of both 64-bit table pointers built below.
118  %i = call i64 @llvm.amdgcn.s.getpc()
119  %extelt.offset = lshr i64 %i, 32
120  %.i1 = trunc i64 %extelt.offset to i32
  ; %i3: addrspace(4) pointer assembled from %descTable1 (element 0) and the
  ; PC upper bits (element 1).
121  %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0
122  %i1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1
123  %i2 = bitcast <2 x i32> %i1 to i64
124  %i3 = inttoptr i64 %i2 to ptr addrspace(4)
  ; %i6: the same construction, using %descTable0 as element 0.
125  %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0
126  %i4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1
127  %i5 = bitcast <2 x i32> %i4 to i64
128  %i6 = inttoptr i64 %i5 to ptr addrspace(4)
  ; Load the four <4 x i32> values passed to the raw buffer intrinsics below:
  ; %i8 is used by the final store, %i10/%i12/%i14 by the three buffer loads.
129  %i7 = getelementptr i8, ptr addrspace(4) %i6, i64 80
130  %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16
131  %i9 = getelementptr i8, ptr addrspace(4) %i3, i64 48
132  %i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16
133  %i11 = getelementptr i8, ptr addrspace(4) %i6, i64 64
134  %i12 = load <4 x i32>, ptr addrspace(4) %i11, align 16
135  %i13 = getelementptr i8, ptr addrspace(4) %i6, i64 16
136  %i14 = load <4 x i32>, ptr addrspace(4) %i13, align 16
  ; Vector operands for the first image sample: <8 x i32> at byte offset 32
  ; and <4 x i32> at byte offset 0 from %i6.
137  %i15 = getelementptr i8, ptr addrspace(4) %i6, i64 32
138  %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32
139  %i17 = load <4 x i32>, ptr addrspace(4) %i6, align 16
  ; First image sample; its float result is tested for ordered equality with 0.0.
140  %i18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i16, <4 x i32> %i17, i1 false, i32 0, i32 0)
141  %i19 = fcmp oeq float %i18, 0.000000e+00
  ; Two buffer loads, each compared against 2752 (printed as 0xac0 in the checks).
142  %i20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i14, i32 0, i32 0, i32 0)
143  %.not = icmp eq i32 %i20, 2752
144  %i21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i12, i32 0, i32 0, i32 0)
145  %.not1 = icmp eq i32 %i21, 2752
  ; Vector operands for the second image sample, read from %i3 at byte
  ; offsets 16 (<8 x i32>) and 0 (<4 x i32>).
146  %i22 = getelementptr i8, ptr addrspace(4) %i3, i64 16
147  %i23 = load <8 x i32>, ptr addrspace(4) %i22, align 32
148  %i24 = load <4 x i32>, ptr addrspace(4) %i3, align 16
  ; Second image sample; its result is tested for ordered equality with 1.0.
149  %i25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i23, <4 x i32> %i24, i1 false, i32 0, i32 0)
150  %i26 = fcmp oeq float %i25, 1.000000e+00
  ; Third buffer load, again compared against 2752.
151  %i27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i10, i32 0, i32 0, i32 0)
152  %.not2 = icmp eq i32 %i27, 2752
  ; Fold the five comparison results together: each select with a constant
  ; false arm yields its second operand only when the first is true, i.e. a
  ; logical AND of the two conditions.
153  %i28 = select i1 %.not2, i1 %i26, i1 false
154  %i29 = select i1 %i28, i1 %.not1, i1 false
155  %i30 = select i1 %i29, i1 %.not, i1 false
156  %narrow2 = select i1 %i30, i1 %i19, i1 false
  ; Zero-extend the combined condition to i32 and write it out through the
  ; buffer store that uses %i8.
157  %.4 = zext i1 %narrow2 to i32
158  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %i8, i32 0, i32 0, i32 0)
159  ret void
160}
161