xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll (revision 35f7b60aa6105753859bcccaf4a793aaf16b4acd)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG %s
3; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
4; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
5
; Codegen test: llvm.amdgcn.permlane16 applied to a default-address-space
; pointer (ptr). %src0 is passed as both pointer operands and both i1 operands
; are false. On each checked target the expected code moves the two i32
; operands into SGPRs with v_readfirstlane_b32, issues one v_permlane16_b32 per
; 32-bit half of the value (v3, then v2) and stores the two-dword result to %out.
6define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
7; GFX10-SDAG-LABEL: v_permlane16_p0:
8; GFX10-SDAG:       ; %bb.0:
9; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
11; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
12; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
13; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
14; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
15; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
16;
17; GFX11-SDAG-LABEL: v_permlane16_p0:
18; GFX11-SDAG:       ; %bb.0:
19; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
21; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
22; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
23; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
24; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
25; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
26; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
27;
28; GFX12-SDAG-LABEL: v_permlane16_p0:
29; GFX12-SDAG:       ; %bb.0:
30; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
31; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
32; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
33; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
34; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
35; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
36; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
37; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
38; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
39; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
40; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
41; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
42  %v = call ptr @llvm.amdgcn.permlane16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
43  store ptr %v, ptr addrspace(1) %out
44  ret void
45}
46
; Codegen test: llvm.amdgcn.permlanex16 applied to a default-address-space
; pointer (ptr). %src0 is passed as both pointer operands and both i1 operands
; are false. On each checked target the expected code moves the two i32
; operands into SGPRs with v_readfirstlane_b32, issues one v_permlanex16_b32 per
; 32-bit half of the value (v3, then v2) and stores the two-dword result to %out.
47define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
48; GFX10-SDAG-LABEL: v_permlanex16_p0:
49; GFX10-SDAG:       ; %bb.0:
50; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
52; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
53; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
54; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
55; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
56; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
57;
58; GFX11-SDAG-LABEL: v_permlanex16_p0:
59; GFX11-SDAG:       ; %bb.0:
60; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
62; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
63; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
64; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
65; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
66; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
67; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
68;
69; GFX12-SDAG-LABEL: v_permlanex16_p0:
70; GFX12-SDAG:       ; %bb.0:
71; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
72; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
73; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
74; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
75; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
76; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
77; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
78; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
79; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
80; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
81; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
82; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
83  %v = call ptr @llvm.amdgcn.permlanex16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
84  store ptr %v, ptr addrspace(1) %out
85  ret void
86}
87
; Codegen test: llvm.amdgcn.permlane16 applied to a <3 x ptr> vector.
; %src0 is passed as both vector operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32 and issues six v_permlane16_b32 instructions
; (v7 down to v2), one per 32-bit piece of the vector. The result is written to
; %out as two stores: v[6:7] at offset 16 and v[2:5] at offset 0.
88define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) {
89; GFX10-SDAG-LABEL: v_permlane16_v3p0:
90; GFX10-SDAG:       ; %bb.0:
91; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
93; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v9
94; GFX10-SDAG-NEXT:    v_permlane16_b32 v7, v7, s4, s5
95; GFX10-SDAG-NEXT:    v_permlane16_b32 v6, v6, s4, s5
96; GFX10-SDAG-NEXT:    v_permlane16_b32 v5, v5, s4, s5
97; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
98; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
99; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
100; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
101; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
102; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
103;
104; GFX11-SDAG-LABEL: v_permlane16_v3p0:
105; GFX11-SDAG:       ; %bb.0:
106; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
108; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
109; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
110; GFX11-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
111; GFX11-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
112; GFX11-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
113; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
114; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
115; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
116; GFX11-SDAG-NEXT:    s_clause 0x1
117; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
118; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
119; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
120;
121; GFX12-SDAG-LABEL: v_permlane16_v3p0:
122; GFX12-SDAG:       ; %bb.0:
123; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
124; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
125; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
126; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
127; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
128; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
129; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
130; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
131; GFX12-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
132; GFX12-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
133; GFX12-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
134; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
135; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
136; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
137; GFX12-SDAG-NEXT:    s_clause 0x1
138; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
139; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
140; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
141  %v = call <3 x ptr> @llvm.amdgcn.permlane16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
142  store <3 x ptr> %v, ptr addrspace(1) %out
143  ret void
144}
145
; Codegen test: llvm.amdgcn.permlanex16 applied to a <3 x ptr> vector.
; %src0 is passed as both vector operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32 and issues six v_permlanex16_b32 instructions
; (v7 down to v2), one per 32-bit piece of the vector. The result is written to
; %out as two stores: v[6:7] at offset 16 and v[2:5] at offset 0.
146define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) {
147; GFX10-SDAG-LABEL: v_permlanex16_v3p0:
148; GFX10-SDAG:       ; %bb.0:
149; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
151; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v9
152; GFX10-SDAG-NEXT:    v_permlanex16_b32 v7, v7, s4, s5
153; GFX10-SDAG-NEXT:    v_permlanex16_b32 v6, v6, s4, s5
154; GFX10-SDAG-NEXT:    v_permlanex16_b32 v5, v5, s4, s5
155; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
156; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
157; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
158; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
159; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
160; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
161;
162; GFX11-SDAG-LABEL: v_permlanex16_v3p0:
163; GFX11-SDAG:       ; %bb.0:
164; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
166; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
167; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
168; GFX11-SDAG-NEXT:    v_permlanex16_b32 v7, v7, s0, s1
169; GFX11-SDAG-NEXT:    v_permlanex16_b32 v6, v6, s0, s1
170; GFX11-SDAG-NEXT:    v_permlanex16_b32 v5, v5, s0, s1
171; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
172; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
173; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
174; GFX11-SDAG-NEXT:    s_clause 0x1
175; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
176; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
177; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
178;
179; GFX12-SDAG-LABEL: v_permlanex16_v3p0:
180; GFX12-SDAG:       ; %bb.0:
181; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
182; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
183; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
184; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
185; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
186; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
187; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
188; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
189; GFX12-SDAG-NEXT:    v_permlanex16_b32 v7, v7, s0, s1
190; GFX12-SDAG-NEXT:    v_permlanex16_b32 v6, v6, s0, s1
191; GFX12-SDAG-NEXT:    v_permlanex16_b32 v5, v5, s0, s1
192; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
193; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
194; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
195; GFX12-SDAG-NEXT:    s_clause 0x1
196; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
197; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
198; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
199  %v = call <3 x ptr> @llvm.amdgcn.permlanex16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
200  store <3 x ptr> %v, ptr addrspace(1) %out
201  ret void
202}
203
; Codegen test: llvm.amdgcn.permlane16 applied to an addrspace(3) pointer.
; %src0 is passed as both pointer operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32, issues a single v_permlane16_b32 on v2 and
; stores the one-dword result to %out.
204define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) {
205; GFX10-SDAG-LABEL: v_permlane16_p3:
206; GFX10-SDAG:       ; %bb.0:
207; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
208; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
209; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
210; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
211; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
212; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
213;
214; GFX11-SDAG-LABEL: v_permlane16_p3:
215; GFX11-SDAG:       ; %bb.0:
216; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
218; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
219; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
220; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
221; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
222; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
223;
224; GFX12-SDAG-LABEL: v_permlane16_p3:
225; GFX12-SDAG:       ; %bb.0:
226; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
227; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
228; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
229; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
230; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
231; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
232; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
233; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
234; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
235; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
236; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
237  %v = call ptr addrspace(3) @llvm.amdgcn.permlane16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
238  store ptr addrspace(3) %v, ptr addrspace(1) %out
239  ret void
240}
241
; Codegen test: llvm.amdgcn.permlanex16 applied to an addrspace(3) pointer.
; %src0 is passed as both pointer operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32, issues a single v_permlanex16_b32 on v2 and
; stores the one-dword result to %out.
242define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) {
243; GFX10-SDAG-LABEL: v_permlanex16_p3:
244; GFX10-SDAG:       ; %bb.0:
245; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
247; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
248; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
249; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
250; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
251;
252; GFX11-SDAG-LABEL: v_permlanex16_p3:
253; GFX11-SDAG:       ; %bb.0:
254; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
256; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
257; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
258; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
259; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
260; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX12-SDAG-LABEL: v_permlanex16_p3:
263; GFX12-SDAG:       ; %bb.0:
264; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
265; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
266; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
267; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
268; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
269; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
270; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
271; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
272; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
273; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
274; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
275  %v = call ptr addrspace(3) @llvm.amdgcn.permlanex16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
276  store ptr addrspace(3) %v, ptr addrspace(1) %out
277  ret void
278}
279
; Codegen test: llvm.amdgcn.permlane16 applied to a <3 x ptr addrspace(3)>
; vector. %src0 is passed as both vector operands and both i1 operands are
; false. On each checked target the expected code moves the two i32 operands
; into SGPRs with v_readfirstlane_b32, issues three v_permlane16_b32
; instructions (v4, v3, v2), one per element, and writes v[2:4] to %out with a
; single three-dword store.
280define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) {
281; GFX10-SDAG-LABEL: v_permlane16_v3p3:
282; GFX10-SDAG:       ; %bb.0:
283; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
284; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
285; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
286; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
287; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
288; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
289; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
290; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
291;
292; GFX11-SDAG-LABEL: v_permlane16_v3p3:
293; GFX11-SDAG:       ; %bb.0:
294; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
296; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
297; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
298; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
299; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
300; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
301; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
302; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
303;
304; GFX12-SDAG-LABEL: v_permlane16_v3p3:
305; GFX12-SDAG:       ; %bb.0:
306; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
307; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
308; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
309; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
310; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
311; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
312; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
313; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
314; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
315; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
316; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
317; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
318; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
319  %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
320  store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
321  ret void
322}
323
; Codegen test: llvm.amdgcn.permlanex16 applied to a <3 x ptr addrspace(3)>
; vector. %src0 is passed as both vector operands and both i1 operands are
; false. On each checked target the expected code moves the two i32 operands
; into SGPRs with v_readfirstlane_b32, issues three v_permlanex16_b32
; instructions (v4, v3, v2), one per element, and writes v[2:4] to %out with a
; single three-dword store.
324define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) {
325; GFX10-SDAG-LABEL: v_permlanex16_v3p3:
326; GFX10-SDAG:       ; %bb.0:
327; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
329; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
330; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
331; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
332; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
333; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
334; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
335;
336; GFX11-SDAG-LABEL: v_permlanex16_v3p3:
337; GFX11-SDAG:       ; %bb.0:
338; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
340; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
341; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
342; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
343; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
344; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
345; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
346; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
347;
348; GFX12-SDAG-LABEL: v_permlanex16_v3p3:
349; GFX12-SDAG:       ; %bb.0:
350; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
351; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
352; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
353; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
354; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
355; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
356; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
357; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
358; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
359; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
360; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
361; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
362; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
363  %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlanex16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
364  store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
365  ret void
366}
367
; Codegen test: llvm.amdgcn.permlane16 applied to an addrspace(5) pointer.
; %src0 is passed as both pointer operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32, issues a single v_permlane16_b32 on v2 and
; stores the one-dword result to %out.
368define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) {
369; GFX10-SDAG-LABEL: v_permlane16_p5:
370; GFX10-SDAG:       ; %bb.0:
371; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
373; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
374; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
375; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
376; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
377;
378; GFX11-SDAG-LABEL: v_permlane16_p5:
379; GFX11-SDAG:       ; %bb.0:
380; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
382; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
383; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
384; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
385; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
386; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
387;
388; GFX12-SDAG-LABEL: v_permlane16_p5:
389; GFX12-SDAG:       ; %bb.0:
390; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
391; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
392; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
393; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
394; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
395; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
396; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
397; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
398; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
399; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
400; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
401  %v = call ptr addrspace(5) @llvm.amdgcn.permlane16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
402  store ptr addrspace(5) %v, ptr addrspace(1) %out
403  ret void
404}
405
; Codegen test: llvm.amdgcn.permlanex16 applied to an addrspace(5) pointer.
; %src0 is passed as both pointer operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32, issues a single v_permlanex16_b32 on v2 and
; stores the one-dword result to %out.
406define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) {
407; GFX10-SDAG-LABEL: v_permlanex16_p5:
408; GFX10-SDAG:       ; %bb.0:
409; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
411; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
412; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
413; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
414; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
415;
416; GFX11-SDAG-LABEL: v_permlanex16_p5:
417; GFX11-SDAG:       ; %bb.0:
418; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
420; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
421; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
422; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
423; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
424; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
425;
426; GFX12-SDAG-LABEL: v_permlanex16_p5:
427; GFX12-SDAG:       ; %bb.0:
428; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
429; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
430; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
431; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
432; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
433; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
434; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
435; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
436; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
437; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
438; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
439  %v = call ptr addrspace(5) @llvm.amdgcn.permlanex16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
440  store ptr addrspace(5) %v, ptr addrspace(1) %out
441  ret void
442}
443
; Codegen test: llvm.amdgcn.permlane16 applied to a <3 x ptr addrspace(5)>
; vector. %src0 is passed as both vector operands and both i1 operands are
; false. On each checked target the expected code moves the two i32 operands
; into SGPRs with v_readfirstlane_b32, issues three v_permlane16_b32
; instructions (v4, v3, v2), one per element, and writes v[2:4] to %out with a
; single three-dword store.
444define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) {
445; GFX10-SDAG-LABEL: v_permlane16_v3p5:
446; GFX10-SDAG:       ; %bb.0:
447; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
449; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
450; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
451; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
452; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
453; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
454; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
455;
456; GFX11-SDAG-LABEL: v_permlane16_v3p5:
457; GFX11-SDAG:       ; %bb.0:
458; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
459; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
460; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
461; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
462; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
463; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
464; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
465; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
466; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
467;
468; GFX12-SDAG-LABEL: v_permlane16_v3p5:
469; GFX12-SDAG:       ; %bb.0:
470; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
471; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
472; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
473; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
474; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
475; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
476; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
477; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
478; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
479; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
480; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
481; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
482; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
483  %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
484  store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
485  ret void
486}
487
; Codegen test: llvm.amdgcn.permlanex16 applied to a <3 x ptr addrspace(5)>
; vector. %src0 is passed as both vector operands and both i1 operands are
; false. On each checked target the expected code moves the two i32 operands
; into SGPRs with v_readfirstlane_b32, issues three v_permlanex16_b32
; instructions (v4, v3, v2), one per element, and writes v[2:4] to %out with a
; single three-dword store.
488define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) {
489; GFX10-SDAG-LABEL: v_permlanex16_v3p5:
490; GFX10-SDAG:       ; %bb.0:
491; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
493; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
494; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
495; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
496; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
497; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
498; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
499;
500; GFX11-SDAG-LABEL: v_permlanex16_v3p5:
501; GFX11-SDAG:       ; %bb.0:
502; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
503; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
504; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
505; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
506; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
507; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
508; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
509; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
510; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
511;
512; GFX12-SDAG-LABEL: v_permlanex16_v3p5:
513; GFX12-SDAG:       ; %bb.0:
514; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
515; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
516; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
517; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
518; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
519; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
520; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
521; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
522; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
523; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
524; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
525; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
526; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
527  %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlanex16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
528  store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
529  ret void
530}
531
; Codegen test: llvm.amdgcn.permlane16 applied to an addrspace(6) pointer.
; %src0 is passed as both pointer operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32, issues a single v_permlane16_b32 on v2 and
; stores the one-dword result to %out.
532define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) {
533; GFX10-SDAG-LABEL: v_permlane16_p6:
534; GFX10-SDAG:       ; %bb.0:
535; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
537; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
538; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
539; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
540; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
541;
542; GFX11-SDAG-LABEL: v_permlane16_p6:
543; GFX11-SDAG:       ; %bb.0:
544; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
546; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
547; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
548; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
549; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
550; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
551;
552; GFX12-SDAG-LABEL: v_permlane16_p6:
553; GFX12-SDAG:       ; %bb.0:
554; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
555; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
556; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
557; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
558; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
559; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
560; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
561; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
562; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
563; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
564; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
565  %v = call ptr addrspace(6) @llvm.amdgcn.permlane16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
566  store ptr addrspace(6) %v, ptr addrspace(1) %out
567  ret void
568}
569
; Codegen test: llvm.amdgcn.permlanex16 applied to an addrspace(6) pointer.
; %src0 is passed as both pointer operands and both i1 operands are false.
; On each checked target the expected code moves the two i32 operands into
; SGPRs with v_readfirstlane_b32, issues a single v_permlanex16_b32 on v2 and
; stores the one-dword result to %out.
570define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) {
571; GFX10-SDAG-LABEL: v_permlanex16_p6:
572; GFX10-SDAG:       ; %bb.0:
573; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
574; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
575; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
576; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
577; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
578; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
579;
580; GFX11-SDAG-LABEL: v_permlanex16_p6:
581; GFX11-SDAG:       ; %bb.0:
582; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
583; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
584; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
585; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
586; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
587; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
588; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
589;
590; GFX12-SDAG-LABEL: v_permlanex16_p6:
591; GFX12-SDAG:       ; %bb.0:
592; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
593; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
594; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
595; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
596; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
597; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
598; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
599; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
600; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
601; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
602; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
603  %v = call ptr addrspace(6) @llvm.amdgcn.permlanex16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
604  store ptr addrspace(6) %v, ptr addrspace(1) %out
605  ret void
606}
607
; Codegen test: llvm.amdgcn.permlane16 applied to a <3 x ptr addrspace(6)>
; vector. %src0 is passed as both vector operands and both i1 operands are
; false. On each checked target the expected code moves the two i32 operands
; into SGPRs with v_readfirstlane_b32, issues three v_permlane16_b32
; instructions (v4, v3, v2), one per element, and writes v[2:4] to %out with a
; single three-dword store.
608define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2) {
609; GFX10-SDAG-LABEL: v_permlane16_v3p6:
610; GFX10-SDAG:       ; %bb.0:
611; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
613; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
614; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
615; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
616; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
617; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
618; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
619;
620; GFX11-SDAG-LABEL: v_permlane16_v3p6:
621; GFX11-SDAG:       ; %bb.0:
622; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
623; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
624; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
625; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
626; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
627; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
628; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
629; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
630; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
631;
632; GFX12-SDAG-LABEL: v_permlane16_v3p6:
633; GFX12-SDAG:       ; %bb.0:
634; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
635; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
636; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
637; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
638; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
639; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
640; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
641; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
642; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
643; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
644; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
645; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
646; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
647  %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
648  store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
649  ret void
650}
651
; Codegen test: llvm.amdgcn.permlanex16 applied to a <3 x ptr addrspace(6)>
; vector. %src0 is passed as both vector operands and both i1 operands are
; false. On each checked target the expected code moves the two i32 operands
; into SGPRs with v_readfirstlane_b32, issues three v_permlanex16_b32
; instructions (v4, v3, v2), one per element, and writes v[2:4] to %out with a
; single three-dword store.
652define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2) {
653; GFX10-SDAG-LABEL: v_permlanex16_v3p6:
654; GFX10-SDAG:       ; %bb.0:
655; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
657; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
658; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
659; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
660; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
661; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
662; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
663;
664; GFX11-SDAG-LABEL: v_permlanex16_v3p6:
665; GFX11-SDAG:       ; %bb.0:
666; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
668; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
669; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
670; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
671; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
672; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
673; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
674; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
675;
676; GFX12-SDAG-LABEL: v_permlanex16_v3p6:
677; GFX12-SDAG:       ; %bb.0:
678; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
679; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
680; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
681; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
682; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
683; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
684; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
685; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
686; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
687; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
688; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
689; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
690; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
691  %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlanex16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
692  store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
693  ret void
694}
695