xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll (revision 5feb32ba929f9e517c530217cabb09d1d734a763)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX802-SDAG %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
5
; test_writelane_p0: llvm.amdgcn.writelane on a single default-address-space
; pointer (ptr / p0).
;
; The IR loads the current pointer stored at %out, passes it as the third
; operand of the intrinsic together with %src (value) and %src1 (second
; operand), and stores the intrinsic result back to %out.
;
; Expected code, per the checks below: the pointer occupies two 32-bit VGPRs
; (v[5:6]), so the operation is emitted as two v_writelane_b32 instructions,
; each fed by a v_readfirstlane_b32 of the matching half of %src. On gfx802
; the second operand is read into m0; on gfx1010/gfx1100 it is read into an
; SGPR instead.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
6define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
7; GFX802-SDAG-LABEL: test_writelane_p0:
8; GFX802-SDAG:       ; %bb.0:
9; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[5:6], v[0:1]
11; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v4
12; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
13; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v2
14; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
15; GFX802-SDAG-NEXT:    s_nop 0
16; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s4, m0
17; GFX802-SDAG-NEXT:    v_writelane_b32 v5, s5, m0
18; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[5:6]
19; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
20; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX1010-SDAG-LABEL: test_writelane_p0:
23; GFX1010-SDAG:       ; %bb.0:
24; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
26; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
27; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
28; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
29; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
30; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s4, s5
31; GFX1010-SDAG-NEXT:    v_writelane_b32 v5, s6, s5
32; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[5:6], off
33; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX1100-SDAG-LABEL: test_writelane_p0:
36; GFX1100-SDAG:       ; %bb.0:
37; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX1100-SDAG-NEXT:    global_load_b64 v[5:6], v[0:1], off
39; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
40; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
41; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
42; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
43; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
44; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s0, s1
45; GFX1100-SDAG-NEXT:    v_writelane_b32 v5, s2, s1
46; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[5:6], off
47; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
48  %oldval = load ptr, ptr addrspace(1) %out
49  %writelane = call ptr @llvm.amdgcn.writelane.p0(ptr %src, i32 %src1, ptr %oldval)
50  store ptr %writelane, ptr addrspace(1) %out, align 4
51  ret void
52}
53
; test_writelane_v3p0: llvm.amdgcn.writelane on a <3 x ptr> vector of
; default-address-space pointers.
;
; The IR loads the current <3 x ptr> from %out, passes it as the third
; operand of the intrinsic together with %src and %src1, and stores the
; result back to %out.
;
; Expected code, per the checks below: the vector occupies six 32-bit VGPRs,
; so six v_writelane_b32 instructions are emitted, one per register. The
; memory accesses are split into a four-dword and a two-dword load/store
; (the second at byte offset 16), and the writelanes are grouped behind
; separate s_waitcnt vmcnt(1) / vmcnt(0) waits for the two loads. On gfx802
; the second operand is read into m0; on gfx1010/gfx1100 it is read into an
; SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
54define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1) {
55; GFX802-SDAG-LABEL: test_writelane_v3p0:
56; GFX802-SDAG:       ; %bb.0:
57; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58; GFX802-SDAG-NEXT:    v_add_u32_e32 v13, vcc, 16, v0
59; GFX802-SDAG-NEXT:    flat_load_dwordx4 v[9:12], v[0:1]
60; GFX802-SDAG-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
61; GFX802-SDAG-NEXT:    flat_load_dwordx2 v[15:16], v[13:14]
62; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v8
63; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v5
64; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s7, v4
65; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s8, v3
66; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s9, v2
67; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
68; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
69; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(1)
70; GFX802-SDAG-NEXT:    v_writelane_b32 v12, s6, m0
71; GFX802-SDAG-NEXT:    v_writelane_b32 v11, s7, m0
72; GFX802-SDAG-NEXT:    v_writelane_b32 v10, s8, m0
73; GFX802-SDAG-NEXT:    v_writelane_b32 v9, s9, m0
74; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
75; GFX802-SDAG-NEXT:    v_writelane_b32 v16, s4, m0
76; GFX802-SDAG-NEXT:    v_writelane_b32 v15, s5, m0
77; GFX802-SDAG-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
78; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[13:14], v[15:16]
79; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
80; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
81;
82; GFX1010-SDAG-LABEL: test_writelane_v3p0:
83; GFX1010-SDAG:       ; %bb.0:
84; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85; GFX1010-SDAG-NEXT:    s_clause 0x1
86; GFX1010-SDAG-NEXT:    global_load_dwordx2 v[13:14], v[0:1], off offset:16
87; GFX1010-SDAG-NEXT:    global_load_dwordx4 v[9:12], v[0:1], off
88; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v8
89; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v5
90; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s8, v4
91; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s9, v3
92; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s10, v2
93; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v7
94; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
95; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(1)
96; GFX1010-SDAG-NEXT:    v_writelane_b32 v14, s4, s5
97; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
98; GFX1010-SDAG-NEXT:    v_writelane_b32 v12, s7, s5
99; GFX1010-SDAG-NEXT:    v_writelane_b32 v11, s8, s5
100; GFX1010-SDAG-NEXT:    v_writelane_b32 v10, s9, s5
101; GFX1010-SDAG-NEXT:    v_writelane_b32 v9, s10, s5
102; GFX1010-SDAG-NEXT:    v_writelane_b32 v13, s6, s5
103; GFX1010-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[9:12], off
104; GFX1010-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[13:14], off offset:16
105; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
106;
107; GFX1100-SDAG-LABEL: test_writelane_v3p0:
108; GFX1100-SDAG:       ; %bb.0:
109; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110; GFX1100-SDAG-NEXT:    s_clause 0x1
111; GFX1100-SDAG-NEXT:    global_load_b64 v[13:14], v[0:1], off offset:16
112; GFX1100-SDAG-NEXT:    global_load_b128 v[9:12], v[0:1], off
113; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v8
114; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v5
115; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
116; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
117; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
118; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v7
119; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
120; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(1)
121; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
122; GFX1100-SDAG-NEXT:    v_writelane_b32 v14, s0, s1
123; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
124; GFX1100-SDAG-NEXT:    v_writelane_b32 v12, s3, s1
125; GFX1100-SDAG-NEXT:    v_writelane_b32 v11, s4, s1
126; GFX1100-SDAG-NEXT:    v_writelane_b32 v10, s5, s1
127; GFX1100-SDAG-NEXT:    v_writelane_b32 v9, s6, s1
128; GFX1100-SDAG-NEXT:    v_writelane_b32 v13, s2, s1
129; GFX1100-SDAG-NEXT:    s_clause 0x1
130; GFX1100-SDAG-NEXT:    global_store_b128 v[0:1], v[9:12], off
131; GFX1100-SDAG-NEXT:    global_store_b64 v[0:1], v[13:14], off offset:16
132; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
133  %oldval = load <3 x ptr>, ptr addrspace(1) %out
134  %writelane = call <3 x ptr> @llvm.amdgcn.writelane.v3p0(<3 x ptr> %src, i32 %src1, <3 x ptr> %oldval)
135  store <3 x ptr> %writelane, ptr addrspace(1) %out, align 4
136  ret void
137}
138
; test_writelane_p3: llvm.amdgcn.writelane on a single address-space-3
; pointer (ptr addrspace(3)).
;
; The IR loads the current pointer stored at %out, passes it as the third
; operand of the intrinsic together with %src and %src1, and stores the
; result back to %out.
;
; Expected code, per the checks below: the pointer fits in one 32-bit VGPR
; (v4, loaded with a single-dword load), so exactly one v_writelane_b32 is
; emitted. On gfx802 the second operand is read into m0; on gfx1010/gfx1100
; it is read into an SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
139define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 %src1) {
140; GFX802-SDAG-LABEL: test_writelane_p3:
141; GFX802-SDAG:       ; %bb.0:
142; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
144; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
145; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
146; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
147; GFX802-SDAG-NEXT:    s_nop 1
148; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
149; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
150; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
151; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
152;
153; GFX1010-SDAG-LABEL: test_writelane_p3:
154; GFX1010-SDAG:       ; %bb.0:
155; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
157; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
158; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
159; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
160; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
161; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
162; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
163;
164; GFX1100-SDAG-LABEL: test_writelane_p3:
165; GFX1100-SDAG:       ; %bb.0:
166; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
168; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
169; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
170; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
171; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
172; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
173; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
174; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
175  %oldval = load ptr addrspace(3), ptr addrspace(1) %out
176  %writelane = call ptr addrspace(3) @llvm.amdgcn.writelane.p3(ptr addrspace(3) %src, i32 %src1, ptr addrspace(3) %oldval)
177  store ptr addrspace(3) %writelane, ptr addrspace(1) %out, align 4
178  ret void
179}
180
; test_writelane_v3p3: llvm.amdgcn.writelane on a <3 x ptr addrspace(3)>
; vector.
;
; The IR loads the current vector from %out, passes it as the third operand
; of the intrinsic together with %src and %src1, and stores the result back
; to %out.
;
; Expected code, per the checks below: the vector occupies three 32-bit VGPRs
; (v[6:8], moved with a single three-dword load/store), and one
; v_writelane_b32 is emitted per element. On gfx802 the second operand is
; read into m0; on gfx1010/gfx1100 it is read into an SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
181define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src, i32 %src1) {
182; GFX802-SDAG-LABEL: test_writelane_v3p3:
183; GFX802-SDAG:       ; %bb.0:
184; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
186; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
187; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
188; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
189; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
190; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
191; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
192; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
193; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
194; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
195; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
196; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
197;
198; GFX1010-SDAG-LABEL: test_writelane_v3p3:
199; GFX1010-SDAG:       ; %bb.0:
200; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
202; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
203; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
204; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
205; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
206; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
207; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
208; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
209; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
210; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
211; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
212;
213; GFX1100-SDAG-LABEL: test_writelane_v3p3:
214; GFX1100-SDAG:       ; %bb.0:
215; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
217; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
218; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
219; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
220; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
221; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
222; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
223; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
224; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
225; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
226; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
227; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
228; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
229  %oldval = load <3 x ptr addrspace(3)>, ptr addrspace(1) %out
230  %writelane = call <3 x ptr addrspace(3)> @llvm.amdgcn.writelane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1, <3 x ptr addrspace(3)> %oldval)
231  store <3 x ptr addrspace(3)> %writelane, ptr addrspace(1) %out, align 4
232  ret void
233}
234
; test_writelane_p5: llvm.amdgcn.writelane on a single address-space-5
; pointer (ptr addrspace(5)).
;
; The IR loads the current pointer stored at %out, passes it as the third
; operand of the intrinsic together with %src and %src1, and stores the
; result back to %out.
;
; Expected code, per the checks below: the pointer fits in one 32-bit VGPR
; (v4, loaded with a single-dword load), so exactly one v_writelane_b32 is
; emitted. On gfx802 the second operand is read into m0; on gfx1010/gfx1100
; it is read into an SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
235define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 %src1) {
236; GFX802-SDAG-LABEL: test_writelane_p5:
237; GFX802-SDAG:       ; %bb.0:
238; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
240; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
241; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
242; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
243; GFX802-SDAG-NEXT:    s_nop 1
244; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
245; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
246; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
247; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
248;
249; GFX1010-SDAG-LABEL: test_writelane_p5:
250; GFX1010-SDAG:       ; %bb.0:
251; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
253; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
254; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
255; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
256; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
257; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
258; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
259;
260; GFX1100-SDAG-LABEL: test_writelane_p5:
261; GFX1100-SDAG:       ; %bb.0:
262; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
264; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
265; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
266; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
267; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
268; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
269; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
270; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
271  %oldval = load ptr addrspace(5), ptr addrspace(1) %out
272  %writelane = call ptr addrspace(5) @llvm.amdgcn.writelane.p5(ptr addrspace(5) %src, i32 %src1, ptr addrspace(5) %oldval)
273  store ptr addrspace(5) %writelane, ptr addrspace(1) %out, align 4
274  ret void
275}
276
; test_writelane_v3p5: llvm.amdgcn.writelane on a <3 x ptr addrspace(5)>
; vector.
;
; The IR loads the current vector from %out, passes it as the third operand
; of the intrinsic together with %src and %src1, and stores the result back
; to %out.
;
; Expected code, per the checks below: the vector occupies three 32-bit VGPRs
; (v[6:8], moved with a single three-dword load/store), and one
; v_writelane_b32 is emitted per element. On gfx802 the second operand is
; read into m0; on gfx1010/gfx1100 it is read into an SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
277define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src, i32 %src1) {
278; GFX802-SDAG-LABEL: test_writelane_v3p5:
279; GFX802-SDAG:       ; %bb.0:
280; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
282; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
283; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
284; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
285; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
286; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
287; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
288; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
289; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
290; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
291; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
292; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
293;
294; GFX1010-SDAG-LABEL: test_writelane_v3p5:
295; GFX1010-SDAG:       ; %bb.0:
296; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
298; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
299; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
300; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
301; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
302; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
303; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
304; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
305; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
306; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
307; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
308;
309; GFX1100-SDAG-LABEL: test_writelane_v3p5:
310; GFX1100-SDAG:       ; %bb.0:
311; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
313; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
314; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
315; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
316; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
317; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
318; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
319; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
320; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
321; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
322; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
323; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
324; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
325  %oldval = load <3 x ptr addrspace(5)>, ptr addrspace(1) %out
326  %writelane = call <3 x ptr addrspace(5)> @llvm.amdgcn.writelane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1, <3 x ptr addrspace(5)> %oldval)
327  store <3 x ptr addrspace(5)> %writelane, ptr addrspace(1) %out, align 4
328  ret void
329}
330
; test_writelane_p6: llvm.amdgcn.writelane on a single address-space-6
; pointer (ptr addrspace(6)).
;
; The IR loads the current pointer stored at %out, passes it as the third
; operand of the intrinsic together with %src and %src1, and stores the
; result back to %out.
;
; Expected code, per the checks below: the pointer fits in one 32-bit VGPR
; (v4, loaded with a single-dword load), so exactly one v_writelane_b32 is
; emitted. On gfx802 the second operand is read into m0; on gfx1010/gfx1100
; it is read into an SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
331define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 %src1) {
332; GFX802-SDAG-LABEL: test_writelane_p6:
333; GFX802-SDAG:       ; %bb.0:
334; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335; GFX802-SDAG-NEXT:    flat_load_dword v4, v[0:1]
336; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v3
337; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
338; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
339; GFX802-SDAG-NEXT:    s_nop 1
340; GFX802-SDAG-NEXT:    v_writelane_b32 v4, s4, m0
341; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v4
342; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
343; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
344;
345; GFX1010-SDAG-LABEL: test_writelane_p6:
346; GFX1010-SDAG:       ; %bb.0:
347; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348; GFX1010-SDAG-NEXT:    global_load_dword v4, v[0:1], off
349; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
350; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
351; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
352; GFX1010-SDAG-NEXT:    v_writelane_b32 v4, s4, s5
353; GFX1010-SDAG-NEXT:    global_store_dword v[0:1], v4, off
354; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
355;
356; GFX1100-SDAG-LABEL: test_writelane_p6:
357; GFX1100-SDAG:       ; %bb.0:
358; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359; GFX1100-SDAG-NEXT:    global_load_b32 v4, v[0:1], off
360; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v2
361; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
362; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
363; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
364; GFX1100-SDAG-NEXT:    v_writelane_b32 v4, s0, s1
365; GFX1100-SDAG-NEXT:    global_store_b32 v[0:1], v4, off
366; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
367  %oldval = load ptr addrspace(6), ptr addrspace(1) %out
368  %writelane = call ptr addrspace(6) @llvm.amdgcn.writelane.p6(ptr addrspace(6) %src, i32 %src1, ptr addrspace(6) %oldval)
369  store ptr addrspace(6) %writelane, ptr addrspace(1) %out, align 4
370  ret void
371}
372
; test_writelane_v3p6: llvm.amdgcn.writelane on a <3 x ptr addrspace(6)>
; vector.
;
; The IR loads the current vector from %out, passes it as the third operand
; of the intrinsic together with %src and %src1, and stores the result back
; to %out.
;
; Expected code, per the checks below: the vector occupies three 32-bit VGPRs
; (v[6:8], moved with a single three-dword load/store), and one
; v_writelane_b32 is emitted per element. On gfx802 the second operand is
; read into m0; on gfx1010/gfx1100 it is read into an SGPR.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
373define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src, i32 %src1) {
374; GFX802-SDAG-LABEL: test_writelane_v3p6:
375; GFX802-SDAG:       ; %bb.0:
376; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377; GFX802-SDAG-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
378; GFX802-SDAG-NEXT:    v_readfirstlane_b32 m0, v5
379; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
380; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
381; GFX802-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
382; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
383; GFX802-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
384; GFX802-SDAG-NEXT:    v_writelane_b32 v7, s5, m0
385; GFX802-SDAG-NEXT:    v_writelane_b32 v6, s6, m0
386; GFX802-SDAG-NEXT:    flat_store_dwordx3 v[0:1], v[6:8]
387; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
388; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
389;
390; GFX1010-SDAG-LABEL: test_writelane_v3p6:
391; GFX1010-SDAG:       ; %bb.0:
392; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
393; GFX1010-SDAG-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
394; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
395; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
396; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
397; GFX1010-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
398; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0)
399; GFX1010-SDAG-NEXT:    v_writelane_b32 v8, s4, s5
400; GFX1010-SDAG-NEXT:    v_writelane_b32 v7, s6, s5
401; GFX1010-SDAG-NEXT:    v_writelane_b32 v6, s7, s5
402; GFX1010-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off
403; GFX1010-SDAG-NEXT:    s_setpc_b64 s[30:31]
404;
405; GFX1100-SDAG-LABEL: test_writelane_v3p6:
406; GFX1100-SDAG:       ; %bb.0:
407; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408; GFX1100-SDAG-NEXT:    global_load_b96 v[6:8], v[0:1], off
409; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
410; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
411; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s2, v3
412; GFX1100-SDAG-NEXT:    v_readfirstlane_b32 s3, v2
413; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0)
414; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
415; GFX1100-SDAG-NEXT:    v_writelane_b32 v8, s0, s1
416; GFX1100-SDAG-NEXT:    v_writelane_b32 v7, s2, s1
417; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
418; GFX1100-SDAG-NEXT:    v_writelane_b32 v6, s3, s1
419; GFX1100-SDAG-NEXT:    global_store_b96 v[0:1], v[6:8], off
420; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
421  %oldval = load <3 x ptr addrspace(6)>, ptr addrspace(1) %out
422  %writelane = call <3 x ptr addrspace(6)> @llvm.amdgcn.writelane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1, <3 x ptr addrspace(6)> %oldval)
423  store <3 x ptr addrspace(6)> %writelane, ptr addrspace(1) %out, align 4
424  ret void
425}
426