xref: /llvm-project/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
5; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
7
; Test kernel: reads one i64 through %in (addrspace(4)) and writes it unchanged
; through %out (addrspace(1)). The checks below pin the expected llc output for
; every run line at the top of the file: a 2-dword scalar load (s_load_dwordx2 or
; s_load_b64) on the amdgcn targets, a VTX_READ_64 fetch on r600, and in each
; case a single 64-bit store. Attribute group #0 is defined outside this view.
; NOTE(review): the check lines are autogenerated (see the NOTE on the first
; lines of the file); regenerate them with the script instead of hand-editing.
8define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
9; GFX6-LABEL: constant_load_i64:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
14; GFX6-NEXT:    s_mov_b32 s3, 0xf000
15; GFX6-NEXT:    s_mov_b32 s2, -1
16; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX6-NEXT:    v_mov_b32_e32 v0, s4
18; GFX6-NEXT:    v_mov_b32_e32 v1, s5
19; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
20; GFX6-NEXT:    s_endpgm
21;
22; GFX7-LABEL: constant_load_i64:
23; GFX7:       ; %bb.0:
24; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
25; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
26; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
27; GFX7-NEXT:    v_mov_b32_e32 v0, s0
28; GFX7-NEXT:    v_mov_b32_e32 v1, s1
29; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX7-NEXT:    v_mov_b32_e32 v2, s2
31; GFX7-NEXT:    v_mov_b32_e32 v3, s3
32; GFX7-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
33; GFX7-NEXT:    s_endpgm
34;
35; GFX8-LABEL: constant_load_i64:
36; GFX8:       ; %bb.0:
37; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
38; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
40; GFX8-NEXT:    v_mov_b32_e32 v0, s0
41; GFX8-NEXT:    v_mov_b32_e32 v1, s1
42; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX8-NEXT:    v_mov_b32_e32 v2, s2
44; GFX8-NEXT:    v_mov_b32_e32 v3, s3
45; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
46; GFX8-NEXT:    s_endpgm
47;
48; EG-LABEL: constant_load_i64:
49; EG:       ; %bb.0:
50; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
51; EG-NEXT:    TEX 0 @6
52; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
53; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
54; EG-NEXT:    CF_END
55; EG-NEXT:    PAD
56; EG-NEXT:    Fetch clause starting at 6:
57; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
58; EG-NEXT:    ALU clause starting at 8:
59; EG-NEXT:     MOV * T0.X, KC0[2].Z,
60; EG-NEXT:    ALU clause starting at 9:
61; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
62; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
63;
64; GFX12-LABEL: constant_load_i64:
65; GFX12:       ; %bb.0:
66; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
67; GFX12-NEXT:    s_wait_kmcnt 0x0
68; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
69; GFX12-NEXT:    v_mov_b32_e32 v2, 0
70; GFX12-NEXT:    s_wait_kmcnt 0x0
71; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
72; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
73; GFX12-NEXT:    s_endpgm
; IR under test: one load and one store, no explicit alignment and no volatile.
74  %ld = load i64, ptr addrspace(4) %in
75  store i64 %ld, ptr addrspace(1) %out
76  ret void
77}
78
; Test kernel: reads a <2 x i64> vector through %in (addrspace(4)) and writes it
; unchanged through %out (addrspace(1)). The checks below expect the whole
; vector to move as one 128-bit access on every target: s_load_dwordx4 or
; s_load_b128 on amdgcn, VTX_READ_128 on r600, and a single 4-dword store.
; Attribute group #0 is defined outside this view.
; NOTE(review): the check lines are autogenerated (see the NOTE on the first
; lines of the file); regenerate them with the script instead of hand-editing.
79define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
80; GFX6-LABEL: constant_load_v2i64:
81; GFX6:       ; %bb.0: ; %entry
82; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
83; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
85; GFX6-NEXT:    s_mov_b32 s3, 0xf000
86; GFX6-NEXT:    s_mov_b32 s2, -1
87; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX6-NEXT:    v_mov_b32_e32 v0, s4
89; GFX6-NEXT:    v_mov_b32_e32 v1, s5
90; GFX6-NEXT:    v_mov_b32_e32 v2, s6
91; GFX6-NEXT:    v_mov_b32_e32 v3, s7
92; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
93; GFX6-NEXT:    s_endpgm
94;
95; GFX7-LABEL: constant_load_v2i64:
96; GFX7:       ; %bb.0: ; %entry
97; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
98; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
100; GFX7-NEXT:    v_mov_b32_e32 v4, s0
101; GFX7-NEXT:    v_mov_b32_e32 v5, s1
102; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX7-NEXT:    v_mov_b32_e32 v0, s4
104; GFX7-NEXT:    v_mov_b32_e32 v1, s5
105; GFX7-NEXT:    v_mov_b32_e32 v2, s6
106; GFX7-NEXT:    v_mov_b32_e32 v3, s7
107; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
108; GFX7-NEXT:    s_endpgm
109;
110; GFX8-LABEL: constant_load_v2i64:
111; GFX8:       ; %bb.0: ; %entry
112; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
113; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
115; GFX8-NEXT:    v_mov_b32_e32 v4, s0
116; GFX8-NEXT:    v_mov_b32_e32 v5, s1
117; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX8-NEXT:    v_mov_b32_e32 v0, s4
119; GFX8-NEXT:    v_mov_b32_e32 v1, s5
120; GFX8-NEXT:    v_mov_b32_e32 v2, s6
121; GFX8-NEXT:    v_mov_b32_e32 v3, s7
122; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
123; GFX8-NEXT:    s_endpgm
124;
125; EG-LABEL: constant_load_v2i64:
126; EG:       ; %bb.0: ; %entry
127; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
128; EG-NEXT:    TEX 0 @6
129; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
130; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
131; EG-NEXT:    CF_END
132; EG-NEXT:    PAD
133; EG-NEXT:    Fetch clause starting at 6:
134; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
135; EG-NEXT:    ALU clause starting at 8:
136; EG-NEXT:     MOV * T0.X, KC0[2].Z,
137; EG-NEXT:    ALU clause starting at 9:
138; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
139; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
140;
141; GFX12-LABEL: constant_load_v2i64:
142; GFX12:       ; %bb.0: ; %entry
143; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
144; GFX12-NEXT:    s_wait_kmcnt 0x0
145; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
146; GFX12-NEXT:    v_mov_b32_e32 v4, 0
147; GFX12-NEXT:    s_wait_kmcnt 0x0
148; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
149; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
150; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
151; GFX12-NEXT:    s_endpgm
; IR under test: one vector load and one vector store, no explicit alignment.
152entry:
153  %ld = load <2 x i64>, ptr addrspace(4) %in
154  store <2 x i64> %ld, ptr addrspace(1) %out
155  ret void
156}
157
; Test kernel: reads a <3 x i64> vector through %in (addrspace(4)) and writes it
; unchanged through %out (addrspace(1)). Three elements do not fit one access,
; so the checks below expect the copy to be split. On the amdgcn targets that is
; a 4-dword scalar load plus a 2-dword scalar load, stored as a 128-bit store
; plus a 64-bit store 16 bytes further on. On r600 it is two VTX_READ_128
; fetches (offsets 0 and 16), of which the second is stored as T0.XY only.
; Attribute group #0 is defined outside this view.
; NOTE(review): the check lines are autogenerated (see the NOTE on the first
; lines of the file); regenerate them with the script instead of hand-editing.
158define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
159; GFX6-LABEL: constant_load_v3i64:
160; GFX6:       ; %bb.0: ; %entry
161; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
162; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
164; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
165; GFX6-NEXT:    s_mov_b32 s3, 0xf000
166; GFX6-NEXT:    s_mov_b32 s2, -1
167; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX6-NEXT:    v_mov_b32_e32 v0, s8
169; GFX6-NEXT:    v_mov_b32_e32 v1, s9
170; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
171; GFX6-NEXT:    s_waitcnt expcnt(0)
172; GFX6-NEXT:    v_mov_b32_e32 v0, s4
173; GFX6-NEXT:    v_mov_b32_e32 v1, s5
174; GFX6-NEXT:    v_mov_b32_e32 v2, s6
175; GFX6-NEXT:    v_mov_b32_e32 v3, s7
176; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
177; GFX6-NEXT:    s_endpgm
178;
179; GFX7-LABEL: constant_load_v3i64:
180; GFX7:       ; %bb.0: ; %entry
181; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
182; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
184; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
185; GFX7-NEXT:    s_add_u32 s2, s0, 16
186; GFX7-NEXT:    s_addc_u32 s3, s1, 0
187; GFX7-NEXT:    v_mov_b32_e32 v4, s3
188; GFX7-NEXT:    v_mov_b32_e32 v3, s2
189; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX7-NEXT:    v_mov_b32_e32 v5, s8
191; GFX7-NEXT:    v_mov_b32_e32 v6, s9
192; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[5:6]
193; GFX7-NEXT:    v_mov_b32_e32 v5, s1
194; GFX7-NEXT:    v_mov_b32_e32 v0, s4
195; GFX7-NEXT:    v_mov_b32_e32 v1, s5
196; GFX7-NEXT:    v_mov_b32_e32 v2, s6
197; GFX7-NEXT:    v_mov_b32_e32 v3, s7
198; GFX7-NEXT:    v_mov_b32_e32 v4, s0
199; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
200; GFX7-NEXT:    s_endpgm
201;
202; GFX8-LABEL: constant_load_v3i64:
203; GFX8:       ; %bb.0: ; %entry
204; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
205; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX8-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x10
207; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
208; GFX8-NEXT:    s_add_u32 s2, s0, 16
209; GFX8-NEXT:    s_addc_u32 s3, s1, 0
210; GFX8-NEXT:    v_mov_b32_e32 v4, s3
211; GFX8-NEXT:    v_mov_b32_e32 v3, s2
212; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX8-NEXT:    v_mov_b32_e32 v5, s8
214; GFX8-NEXT:    v_mov_b32_e32 v6, s9
215; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[5:6]
216; GFX8-NEXT:    v_mov_b32_e32 v5, s1
217; GFX8-NEXT:    v_mov_b32_e32 v0, s4
218; GFX8-NEXT:    v_mov_b32_e32 v1, s5
219; GFX8-NEXT:    v_mov_b32_e32 v2, s6
220; GFX8-NEXT:    v_mov_b32_e32 v3, s7
221; GFX8-NEXT:    v_mov_b32_e32 v4, s0
222; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
223; GFX8-NEXT:    s_endpgm
224;
225; EG-LABEL: constant_load_v3i64:
226; EG:       ; %bb.0: ; %entry
227; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
228; EG-NEXT:    TEX 0 @8
229; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
230; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
231; EG-NEXT:    TEX 0 @10
232; EG-NEXT:    ALU 3, @15, KC0[CB0:0-32], KC1[]
233; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
234; EG-NEXT:    CF_END
235; EG-NEXT:    Fetch clause starting at 8:
236; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
237; EG-NEXT:    Fetch clause starting at 10:
238; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
239; EG-NEXT:    ALU clause starting at 12:
240; EG-NEXT:     MOV * T0.X, KC0[2].Z,
241; EG-NEXT:    ALU clause starting at 13:
242; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
243; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
244; EG-NEXT:    ALU clause starting at 15:
245; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
246; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
247; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
248; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
249;
250; GFX12-LABEL: constant_load_v3i64:
251; GFX12:       ; %bb.0: ; %entry
252; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
253; GFX12-NEXT:    s_wait_kmcnt 0x0
254; GFX12-NEXT:    s_clause 0x1
255; GFX12-NEXT:    s_load_b64 s[8:9], s[2:3], 0x10
256; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
257; GFX12-NEXT:    s_wait_kmcnt 0x0
258; GFX12-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
259; GFX12-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
260; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
261; GFX12-NEXT:    v_mov_b32_e32 v2, s6
262; GFX12-NEXT:    s_clause 0x1
263; GFX12-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
264; GFX12-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
265; GFX12-NEXT:    s_endpgm
; IR under test: one vector load and one vector store, no explicit alignment.
266entry:
267  %ld = load <3 x i64>, ptr addrspace(4) %in
268  store <3 x i64> %ld, ptr addrspace(1) %out
269  ret void
270}
271
; Test kernel: reads a <4 x i64> vector through %in (addrspace(4)) and writes it
; unchanged through %out (addrspace(1)). The checks below expect a single
; 8-dword scalar load (s_load_dwordx8 or s_load_b256) on the amdgcn targets and
; two VTX_READ_128 fetches on r600. In every configuration the result is written
; back with two 128-bit stores, the upper half (offset 16) first.
; Attribute group #0 is defined outside this view.
; NOTE(review): the check lines are autogenerated (see the NOTE on the first
; lines of the file); regenerate them with the script instead of hand-editing.
272define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
273; GFX6-LABEL: constant_load_v4i64:
274; GFX6:       ; %bb.0: ; %entry
275; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
276; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
278; GFX6-NEXT:    s_mov_b32 s11, 0xf000
279; GFX6-NEXT:    s_mov_b32 s10, -1
280; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX6-NEXT:    v_mov_b32_e32 v0, s4
282; GFX6-NEXT:    v_mov_b32_e32 v1, s5
283; GFX6-NEXT:    v_mov_b32_e32 v2, s6
284; GFX6-NEXT:    v_mov_b32_e32 v3, s7
285; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
286; GFX6-NEXT:    s_waitcnt expcnt(0)
287; GFX6-NEXT:    v_mov_b32_e32 v0, s0
288; GFX6-NEXT:    v_mov_b32_e32 v1, s1
289; GFX6-NEXT:    v_mov_b32_e32 v2, s2
290; GFX6-NEXT:    v_mov_b32_e32 v3, s3
291; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
292; GFX6-NEXT:    s_endpgm
293;
294; GFX7-LABEL: constant_load_v4i64:
295; GFX7:       ; %bb.0: ; %entry
296; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
297; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
299; GFX7-NEXT:    s_add_u32 s10, s8, 16
300; GFX7-NEXT:    s_addc_u32 s11, s9, 0
301; GFX7-NEXT:    v_mov_b32_e32 v6, s10
302; GFX7-NEXT:    v_mov_b32_e32 v7, s11
303; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX7-NEXT:    v_mov_b32_e32 v0, s4
305; GFX7-NEXT:    v_mov_b32_e32 v1, s5
306; GFX7-NEXT:    v_mov_b32_e32 v2, s6
307; GFX7-NEXT:    v_mov_b32_e32 v3, s7
308; GFX7-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
309; GFX7-NEXT:    v_mov_b32_e32 v4, s0
310; GFX7-NEXT:    v_mov_b32_e32 v0, s8
311; GFX7-NEXT:    v_mov_b32_e32 v5, s1
312; GFX7-NEXT:    v_mov_b32_e32 v6, s2
313; GFX7-NEXT:    v_mov_b32_e32 v7, s3
314; GFX7-NEXT:    v_mov_b32_e32 v1, s9
315; GFX7-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
316; GFX7-NEXT:    s_endpgm
317;
318; GFX8-LABEL: constant_load_v4i64:
319; GFX8:       ; %bb.0: ; %entry
320; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
321; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
323; GFX8-NEXT:    s_add_u32 s10, s8, 16
324; GFX8-NEXT:    s_addc_u32 s11, s9, 0
325; GFX8-NEXT:    v_mov_b32_e32 v6, s10
326; GFX8-NEXT:    v_mov_b32_e32 v7, s11
327; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX8-NEXT:    v_mov_b32_e32 v0, s4
329; GFX8-NEXT:    v_mov_b32_e32 v1, s5
330; GFX8-NEXT:    v_mov_b32_e32 v2, s6
331; GFX8-NEXT:    v_mov_b32_e32 v3, s7
332; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
333; GFX8-NEXT:    v_mov_b32_e32 v4, s0
334; GFX8-NEXT:    v_mov_b32_e32 v0, s8
335; GFX8-NEXT:    v_mov_b32_e32 v5, s1
336; GFX8-NEXT:    v_mov_b32_e32 v6, s2
337; GFX8-NEXT:    v_mov_b32_e32 v7, s3
338; GFX8-NEXT:    v_mov_b32_e32 v1, s9
339; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
340; GFX8-NEXT:    s_endpgm
341;
342; EG-LABEL: constant_load_v4i64:
343; EG:       ; %bb.0: ; %entry
344; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
345; EG-NEXT:    TEX 0 @8
346; EG-NEXT:    ALU 3, @13, KC0[CB0:0-32], KC1[]
347; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
348; EG-NEXT:    TEX 0 @10
349; EG-NEXT:    ALU 1, @17, KC0[CB0:0-32], KC1[]
350; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
351; EG-NEXT:    CF_END
352; EG-NEXT:    Fetch clause starting at 8:
353; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
354; EG-NEXT:    Fetch clause starting at 10:
355; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
356; EG-NEXT:    ALU clause starting at 12:
357; EG-NEXT:     MOV * T0.X, KC0[2].Z,
358; EG-NEXT:    ALU clause starting at 13:
359; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
360; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
361; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
362; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
363; EG-NEXT:    ALU clause starting at 17:
364; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
365; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
366;
367; GFX12-LABEL: constant_load_v4i64:
368; GFX12:       ; %bb.0: ; %entry
369; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
370; GFX12-NEXT:    s_wait_kmcnt 0x0
371; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
372; GFX12-NEXT:    s_wait_kmcnt 0x0
373; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
374; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
375; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s1
376; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s3
377; GFX12-NEXT:    v_mov_b32_e32 v6, s2
378; GFX12-NEXT:    s_clause 0x1
379; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[8:9] offset:16
380; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[8:9]
381; GFX12-NEXT:    s_endpgm
; IR under test: one vector load and one vector store, no explicit alignment.
382entry:
383  %ld = load <4 x i64>, ptr addrspace(4) %in
384  store <4 x i64> %ld, ptr addrspace(1) %out
385  ret void
386}
387
; Test kernel: reads an <8 x i64> vector through %in (addrspace(4)) and writes
; it unchanged through %out (addrspace(1)). The checks below expect a single
; 16-dword scalar load (s_load_dwordx16 or s_load_b512) on the amdgcn targets
; and four VTX_READ_128 fetches on r600. In every configuration the result is
; written back with four 128-bit stores at offsets 48, 32, 16 and 0.
; Attribute group #0 is defined outside this view.
; NOTE(review): the check lines are autogenerated (see the NOTE on the first
; lines of the file); regenerate them with the script instead of hand-editing.
388define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
389; GFX6-LABEL: constant_load_v8i64:
390; GFX6:       ; %bb.0: ; %entry
391; GFX6-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x9
392; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX6-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
394; GFX6-NEXT:    s_mov_b32 s19, 0xf000
395; GFX6-NEXT:    s_mov_b32 s18, -1
396; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX6-NEXT:    v_mov_b32_e32 v0, s12
398; GFX6-NEXT:    v_mov_b32_e32 v1, s13
399; GFX6-NEXT:    v_mov_b32_e32 v2, s14
400; GFX6-NEXT:    v_mov_b32_e32 v3, s15
401; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
402; GFX6-NEXT:    s_waitcnt expcnt(0)
403; GFX6-NEXT:    v_mov_b32_e32 v0, s8
404; GFX6-NEXT:    v_mov_b32_e32 v1, s9
405; GFX6-NEXT:    v_mov_b32_e32 v2, s10
406; GFX6-NEXT:    v_mov_b32_e32 v3, s11
407; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
408; GFX6-NEXT:    s_waitcnt expcnt(0)
409; GFX6-NEXT:    v_mov_b32_e32 v0, s4
410; GFX6-NEXT:    v_mov_b32_e32 v1, s5
411; GFX6-NEXT:    v_mov_b32_e32 v2, s6
412; GFX6-NEXT:    v_mov_b32_e32 v3, s7
413; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
414; GFX6-NEXT:    s_waitcnt expcnt(0)
415; GFX6-NEXT:    v_mov_b32_e32 v0, s0
416; GFX6-NEXT:    v_mov_b32_e32 v1, s1
417; GFX6-NEXT:    v_mov_b32_e32 v2, s2
418; GFX6-NEXT:    v_mov_b32_e32 v3, s3
419; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
420; GFX6-NEXT:    s_endpgm
421;
422; GFX7-LABEL: constant_load_v8i64:
423; GFX7:       ; %bb.0: ; %entry
424; GFX7-NEXT:    s_load_dwordx4 s[16:19], s[8:9], 0x0
425; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
426; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
427; GFX7-NEXT:    s_add_u32 s18, s16, 48
428; GFX7-NEXT:    s_addc_u32 s19, s17, 0
429; GFX7-NEXT:    v_mov_b32_e32 v6, s18
430; GFX7-NEXT:    v_mov_b32_e32 v7, s19
431; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX7-NEXT:    v_mov_b32_e32 v0, s12
433; GFX7-NEXT:    v_mov_b32_e32 v1, s13
434; GFX7-NEXT:    v_mov_b32_e32 v2, s14
435; GFX7-NEXT:    v_mov_b32_e32 v3, s15
436; GFX7-NEXT:    v_mov_b32_e32 v4, s8
437; GFX7-NEXT:    s_add_u32 s8, s16, 32
438; GFX7-NEXT:    v_mov_b32_e32 v5, s9
439; GFX7-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
440; GFX7-NEXT:    s_addc_u32 s9, s17, 0
441; GFX7-NEXT:    v_mov_b32_e32 v0, s8
442; GFX7-NEXT:    v_mov_b32_e32 v6, s10
443; GFX7-NEXT:    v_mov_b32_e32 v7, s11
444; GFX7-NEXT:    v_mov_b32_e32 v1, s9
445; GFX7-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
446; GFX7-NEXT:    v_mov_b32_e32 v0, s4
447; GFX7-NEXT:    s_add_u32 s4, s16, 16
448; GFX7-NEXT:    v_mov_b32_e32 v1, s5
449; GFX7-NEXT:    s_addc_u32 s5, s17, 0
450; GFX7-NEXT:    v_mov_b32_e32 v4, s4
451; GFX7-NEXT:    v_mov_b32_e32 v2, s6
452; GFX7-NEXT:    v_mov_b32_e32 v3, s7
453; GFX7-NEXT:    v_mov_b32_e32 v5, s5
454; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
455; GFX7-NEXT:    v_mov_b32_e32 v4, s16
456; GFX7-NEXT:    v_mov_b32_e32 v0, s0
457; GFX7-NEXT:    v_mov_b32_e32 v1, s1
458; GFX7-NEXT:    v_mov_b32_e32 v2, s2
459; GFX7-NEXT:    v_mov_b32_e32 v3, s3
460; GFX7-NEXT:    v_mov_b32_e32 v5, s17
461; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
462; GFX7-NEXT:    s_endpgm
463;
464; GFX8-LABEL: constant_load_v8i64:
465; GFX8:       ; %bb.0: ; %entry
466; GFX8-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
467; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX8-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
469; GFX8-NEXT:    s_add_u32 s18, s16, 48
470; GFX8-NEXT:    s_addc_u32 s19, s17, 0
471; GFX8-NEXT:    v_mov_b32_e32 v6, s18
472; GFX8-NEXT:    v_mov_b32_e32 v7, s19
473; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
474; GFX8-NEXT:    v_mov_b32_e32 v0, s12
475; GFX8-NEXT:    v_mov_b32_e32 v1, s13
476; GFX8-NEXT:    v_mov_b32_e32 v2, s14
477; GFX8-NEXT:    v_mov_b32_e32 v3, s15
478; GFX8-NEXT:    v_mov_b32_e32 v4, s8
479; GFX8-NEXT:    s_add_u32 s8, s16, 32
480; GFX8-NEXT:    v_mov_b32_e32 v5, s9
481; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
482; GFX8-NEXT:    s_addc_u32 s9, s17, 0
483; GFX8-NEXT:    v_mov_b32_e32 v0, s8
484; GFX8-NEXT:    v_mov_b32_e32 v6, s10
485; GFX8-NEXT:    v_mov_b32_e32 v7, s11
486; GFX8-NEXT:    v_mov_b32_e32 v1, s9
487; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
488; GFX8-NEXT:    v_mov_b32_e32 v0, s4
489; GFX8-NEXT:    s_add_u32 s4, s16, 16
490; GFX8-NEXT:    v_mov_b32_e32 v1, s5
491; GFX8-NEXT:    s_addc_u32 s5, s17, 0
492; GFX8-NEXT:    v_mov_b32_e32 v4, s4
493; GFX8-NEXT:    v_mov_b32_e32 v2, s6
494; GFX8-NEXT:    v_mov_b32_e32 v3, s7
495; GFX8-NEXT:    v_mov_b32_e32 v5, s5
496; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
497; GFX8-NEXT:    v_mov_b32_e32 v4, s16
498; GFX8-NEXT:    v_mov_b32_e32 v0, s0
499; GFX8-NEXT:    v_mov_b32_e32 v1, s1
500; GFX8-NEXT:    v_mov_b32_e32 v2, s2
501; GFX8-NEXT:    v_mov_b32_e32 v3, s3
502; GFX8-NEXT:    v_mov_b32_e32 v5, s17
503; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
504; GFX8-NEXT:    s_endpgm
505;
506; EG-LABEL: constant_load_v8i64:
507; EG:       ; %bb.0: ; %entry
508; EG-NEXT:    ALU 0, @22, KC0[CB0:0-32], KC1[]
509; EG-NEXT:    TEX 0 @14
510; EG-NEXT:    ALU 3, @23, KC0[CB0:0-32], KC1[]
511; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
512; EG-NEXT:    TEX 0 @16
513; EG-NEXT:    ALU 3, @27, KC0[CB0:0-32], KC1[]
514; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
515; EG-NEXT:    TEX 0 @18
516; EG-NEXT:    ALU 3, @31, KC0[CB0:0-32], KC1[]
517; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
518; EG-NEXT:    TEX 0 @20
519; EG-NEXT:    ALU 1, @35, KC0[CB0:0-32], KC1[]
520; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
521; EG-NEXT:    CF_END
522; EG-NEXT:    Fetch clause starting at 14:
523; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
524; EG-NEXT:    Fetch clause starting at 16:
525; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
526; EG-NEXT:    Fetch clause starting at 18:
527; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
528; EG-NEXT:    Fetch clause starting at 20:
529; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
530; EG-NEXT:    ALU clause starting at 22:
531; EG-NEXT:     MOV * T0.X, KC0[2].Z,
532; EG-NEXT:    ALU clause starting at 23:
533; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
534; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
535; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
536; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
537; EG-NEXT:    ALU clause starting at 27:
538; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
539; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
540; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
541; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
542; EG-NEXT:    ALU clause starting at 31:
543; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
544; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
545; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
546; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
547; EG-NEXT:    ALU clause starting at 35:
548; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
549; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
550;
551; GFX12-LABEL: constant_load_v8i64:
552; GFX12:       ; %bb.0: ; %entry
553; GFX12-NEXT:    s_load_b128 s[16:19], s[4:5], 0x24
554; GFX12-NEXT:    s_wait_kmcnt 0x0
555; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
556; GFX12-NEXT:    s_wait_kmcnt 0x0
557; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
558; GFX12-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s15
559; GFX12-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s9
560; GFX12-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s11
561; GFX12-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s5
562; GFX12-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v11, s7
563; GFX12-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s1
564; GFX12-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s3
565; GFX12-NEXT:    v_mov_b32_e32 v14, s2
566; GFX12-NEXT:    s_clause 0x3
567; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[16:17] offset:48
568; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[16:17] offset:32
569; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[16:17] offset:16
570; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[16:17]
571; GFX12-NEXT:    s_endpgm
; IR under test: one vector load and one vector store, no explicit alignment.
572entry:
573  %ld = load <8 x i64>, ptr addrspace(4) %in
574  store <8 x i64> %ld, ptr addrspace(1) %out
575  ret void
576}
577
578define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
579; GFX6-LABEL: constant_load_v16i64:
580; GFX6:       ; %bb.0: ; %entry
581; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
582; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX6-NEXT:    s_load_dwordx16 s[16:31], s[2:3], 0x10
584; GFX6-NEXT:    s_mov_b32 s39, 0xf000
585; GFX6-NEXT:    s_mov_b32 s38, -1
586; GFX6-NEXT:    s_mov_b32 s36, s0
587; GFX6-NEXT:    s_mov_b32 s37, s1
588; GFX6-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0x0
589; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX6-NEXT:    v_mov_b32_e32 v0, s28
591; GFX6-NEXT:    v_mov_b32_e32 v1, s29
592; GFX6-NEXT:    v_mov_b32_e32 v2, s30
593; GFX6-NEXT:    v_mov_b32_e32 v3, s31
594; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:112
595; GFX6-NEXT:    s_waitcnt expcnt(0)
596; GFX6-NEXT:    v_mov_b32_e32 v0, s24
597; GFX6-NEXT:    v_mov_b32_e32 v1, s25
598; GFX6-NEXT:    v_mov_b32_e32 v2, s26
599; GFX6-NEXT:    v_mov_b32_e32 v3, s27
600; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:96
601; GFX6-NEXT:    s_waitcnt expcnt(0)
602; GFX6-NEXT:    v_mov_b32_e32 v0, s20
603; GFX6-NEXT:    v_mov_b32_e32 v1, s21
604; GFX6-NEXT:    v_mov_b32_e32 v2, s22
605; GFX6-NEXT:    v_mov_b32_e32 v3, s23
606; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:80
607; GFX6-NEXT:    s_waitcnt expcnt(0)
608; GFX6-NEXT:    v_mov_b32_e32 v0, s16
609; GFX6-NEXT:    v_mov_b32_e32 v1, s17
610; GFX6-NEXT:    v_mov_b32_e32 v2, s18
611; GFX6-NEXT:    v_mov_b32_e32 v3, s19
612; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:64
613; GFX6-NEXT:    s_waitcnt expcnt(0)
614; GFX6-NEXT:    v_mov_b32_e32 v0, s12
615; GFX6-NEXT:    v_mov_b32_e32 v1, s13
616; GFX6-NEXT:    v_mov_b32_e32 v2, s14
617; GFX6-NEXT:    v_mov_b32_e32 v3, s15
618; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:48
619; GFX6-NEXT:    s_waitcnt expcnt(0)
620; GFX6-NEXT:    v_mov_b32_e32 v0, s8
621; GFX6-NEXT:    v_mov_b32_e32 v1, s9
622; GFX6-NEXT:    v_mov_b32_e32 v2, s10
623; GFX6-NEXT:    v_mov_b32_e32 v3, s11
624; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:32
625; GFX6-NEXT:    s_waitcnt expcnt(0)
626; GFX6-NEXT:    v_mov_b32_e32 v0, s4
627; GFX6-NEXT:    v_mov_b32_e32 v1, s5
628; GFX6-NEXT:    v_mov_b32_e32 v2, s6
629; GFX6-NEXT:    v_mov_b32_e32 v3, s7
630; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0 offset:16
631; GFX6-NEXT:    s_waitcnt expcnt(0)
632; GFX6-NEXT:    v_mov_b32_e32 v0, s0
633; GFX6-NEXT:    v_mov_b32_e32 v1, s1
634; GFX6-NEXT:    v_mov_b32_e32 v2, s2
635; GFX6-NEXT:    v_mov_b32_e32 v3, s3
636; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[36:39], 0
637; GFX6-NEXT:    s_endpgm
638;
639; GFX7-LABEL: constant_load_v16i64:
640; GFX7:       ; %bb.0: ; %entry
641; GFX7-NEXT:    s_load_dwordx4 s[16:19], s[8:9], 0x0
642; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
644; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
645; GFX7-NEXT:    v_mov_b32_e32 v0, s12
646; GFX7-NEXT:    v_mov_b32_e32 v1, s13
647; GFX7-NEXT:    v_mov_b32_e32 v2, s14
648; GFX7-NEXT:    v_mov_b32_e32 v3, s15
649; GFX7-NEXT:    v_mov_b32_e32 v4, s8
650; GFX7-NEXT:    v_mov_b32_e32 v5, s9
651; GFX7-NEXT:    v_mov_b32_e32 v6, s10
652; GFX7-NEXT:    v_mov_b32_e32 v7, s11
653; GFX7-NEXT:    v_mov_b32_e32 v8, s4
654; GFX7-NEXT:    v_mov_b32_e32 v9, s5
655; GFX7-NEXT:    v_mov_b32_e32 v10, s6
656; GFX7-NEXT:    v_mov_b32_e32 v11, s7
657; GFX7-NEXT:    v_mov_b32_e32 v12, s0
658; GFX7-NEXT:    v_mov_b32_e32 v13, s1
659; GFX7-NEXT:    v_mov_b32_e32 v14, s2
660; GFX7-NEXT:    v_mov_b32_e32 v15, s3
661; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
662; GFX7-NEXT:    s_add_u32 s18, s16, 0x70
663; GFX7-NEXT:    s_addc_u32 s19, s17, 0
664; GFX7-NEXT:    v_mov_b32_e32 v16, s18
665; GFX7-NEXT:    v_mov_b32_e32 v17, s19
666; GFX7-NEXT:    s_add_u32 s18, s16, 0x60
667; GFX7-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
668; GFX7-NEXT:    s_addc_u32 s19, s17, 0
669; GFX7-NEXT:    v_mov_b32_e32 v0, s18
670; GFX7-NEXT:    v_mov_b32_e32 v1, s19
671; GFX7-NEXT:    s_add_u32 s18, s16, 0x50
672; GFX7-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
673; GFX7-NEXT:    s_addc_u32 s19, s17, 0
674; GFX7-NEXT:    v_mov_b32_e32 v0, s18
675; GFX7-NEXT:    v_mov_b32_e32 v1, s19
676; GFX7-NEXT:    s_add_u32 s18, s16, 64
677; GFX7-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
678; GFX7-NEXT:    s_addc_u32 s19, s17, 0
679; GFX7-NEXT:    v_mov_b32_e32 v0, s18
680; GFX7-NEXT:    v_mov_b32_e32 v1, s19
681; GFX7-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
682; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX7-NEXT:    v_mov_b32_e32 v0, s12
684; GFX7-NEXT:    s_add_u32 s12, s16, 48
685; GFX7-NEXT:    v_mov_b32_e32 v1, s13
686; GFX7-NEXT:    s_addc_u32 s13, s17, 0
687; GFX7-NEXT:    v_mov_b32_e32 v4, s12
688; GFX7-NEXT:    v_mov_b32_e32 v2, s14
689; GFX7-NEXT:    v_mov_b32_e32 v3, s15
690; GFX7-NEXT:    v_mov_b32_e32 v5, s13
691; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
692; GFX7-NEXT:    s_nop 0
693; GFX7-NEXT:    v_mov_b32_e32 v0, s8
694; GFX7-NEXT:    s_add_u32 s8, s16, 32
695; GFX7-NEXT:    v_mov_b32_e32 v1, s9
696; GFX7-NEXT:    s_addc_u32 s9, s17, 0
697; GFX7-NEXT:    v_mov_b32_e32 v4, s8
698; GFX7-NEXT:    v_mov_b32_e32 v2, s10
699; GFX7-NEXT:    v_mov_b32_e32 v3, s11
700; GFX7-NEXT:    v_mov_b32_e32 v5, s9
701; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
702; GFX7-NEXT:    s_nop 0
703; GFX7-NEXT:    v_mov_b32_e32 v0, s4
704; GFX7-NEXT:    s_add_u32 s4, s16, 16
705; GFX7-NEXT:    v_mov_b32_e32 v1, s5
706; GFX7-NEXT:    s_addc_u32 s5, s17, 0
707; GFX7-NEXT:    v_mov_b32_e32 v4, s4
708; GFX7-NEXT:    v_mov_b32_e32 v2, s6
709; GFX7-NEXT:    v_mov_b32_e32 v3, s7
710; GFX7-NEXT:    v_mov_b32_e32 v5, s5
711; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
712; GFX7-NEXT:    v_mov_b32_e32 v4, s16
713; GFX7-NEXT:    v_mov_b32_e32 v0, s0
714; GFX7-NEXT:    v_mov_b32_e32 v1, s1
715; GFX7-NEXT:    v_mov_b32_e32 v2, s2
716; GFX7-NEXT:    v_mov_b32_e32 v3, s3
717; GFX7-NEXT:    v_mov_b32_e32 v5, s17
718; GFX7-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
719; GFX7-NEXT:    s_endpgm
720;
721; GFX8-LABEL: constant_load_v16i64:
722; GFX8:       ; %bb.0: ; %entry
723; GFX8-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x24
724; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
725; GFX8-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x40
726; GFX8-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
727; GFX8-NEXT:    s_add_u32 s34, s36, 0x70
728; GFX8-NEXT:    s_addc_u32 s35, s37, 0
729; GFX8-NEXT:    v_mov_b32_e32 v5, s34
730; GFX8-NEXT:    v_mov_b32_e32 v6, s35
731; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX8-NEXT:    v_mov_b32_e32 v0, s28
733; GFX8-NEXT:    v_mov_b32_e32 v1, s29
734; GFX8-NEXT:    v_mov_b32_e32 v2, s30
735; GFX8-NEXT:    v_mov_b32_e32 v3, s31
736; GFX8-NEXT:    v_mov_b32_e32 v4, s24
737; GFX8-NEXT:    s_add_u32 s24, s36, 0x60
738; GFX8-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
739; GFX8-NEXT:    v_mov_b32_e32 v5, s25
740; GFX8-NEXT:    s_addc_u32 s25, s37, 0
741; GFX8-NEXT:    v_mov_b32_e32 v0, s24
742; GFX8-NEXT:    v_mov_b32_e32 v6, s26
743; GFX8-NEXT:    v_mov_b32_e32 v7, s27
744; GFX8-NEXT:    v_mov_b32_e32 v1, s25
745; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
746; GFX8-NEXT:    v_mov_b32_e32 v0, s20
747; GFX8-NEXT:    s_add_u32 s20, s36, 0x50
748; GFX8-NEXT:    v_mov_b32_e32 v1, s21
749; GFX8-NEXT:    s_addc_u32 s21, s37, 0
750; GFX8-NEXT:    v_mov_b32_e32 v4, s20
751; GFX8-NEXT:    v_mov_b32_e32 v2, s22
752; GFX8-NEXT:    v_mov_b32_e32 v3, s23
753; GFX8-NEXT:    v_mov_b32_e32 v5, s21
754; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
755; GFX8-NEXT:    s_nop 0
756; GFX8-NEXT:    v_mov_b32_e32 v0, s16
757; GFX8-NEXT:    s_add_u32 s16, s36, 64
758; GFX8-NEXT:    v_mov_b32_e32 v1, s17
759; GFX8-NEXT:    s_addc_u32 s17, s37, 0
760; GFX8-NEXT:    v_mov_b32_e32 v4, s16
761; GFX8-NEXT:    v_mov_b32_e32 v2, s18
762; GFX8-NEXT:    v_mov_b32_e32 v3, s19
763; GFX8-NEXT:    v_mov_b32_e32 v5, s17
764; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
765; GFX8-NEXT:    s_nop 0
766; GFX8-NEXT:    v_mov_b32_e32 v0, s12
767; GFX8-NEXT:    s_add_u32 s12, s36, 48
768; GFX8-NEXT:    v_mov_b32_e32 v1, s13
769; GFX8-NEXT:    s_addc_u32 s13, s37, 0
770; GFX8-NEXT:    v_mov_b32_e32 v4, s12
771; GFX8-NEXT:    v_mov_b32_e32 v2, s14
772; GFX8-NEXT:    v_mov_b32_e32 v3, s15
773; GFX8-NEXT:    v_mov_b32_e32 v5, s13
774; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
775; GFX8-NEXT:    s_nop 0
776; GFX8-NEXT:    v_mov_b32_e32 v0, s8
777; GFX8-NEXT:    s_add_u32 s8, s36, 32
778; GFX8-NEXT:    v_mov_b32_e32 v1, s9
779; GFX8-NEXT:    s_addc_u32 s9, s37, 0
780; GFX8-NEXT:    v_mov_b32_e32 v4, s8
781; GFX8-NEXT:    v_mov_b32_e32 v2, s10
782; GFX8-NEXT:    v_mov_b32_e32 v3, s11
783; GFX8-NEXT:    v_mov_b32_e32 v5, s9
784; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
785; GFX8-NEXT:    s_nop 0
786; GFX8-NEXT:    v_mov_b32_e32 v0, s4
787; GFX8-NEXT:    s_add_u32 s4, s36, 16
788; GFX8-NEXT:    v_mov_b32_e32 v1, s5
789; GFX8-NEXT:    s_addc_u32 s5, s37, 0
790; GFX8-NEXT:    v_mov_b32_e32 v4, s4
791; GFX8-NEXT:    v_mov_b32_e32 v2, s6
792; GFX8-NEXT:    v_mov_b32_e32 v3, s7
793; GFX8-NEXT:    v_mov_b32_e32 v5, s5
794; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
795; GFX8-NEXT:    v_mov_b32_e32 v4, s36
796; GFX8-NEXT:    v_mov_b32_e32 v0, s0
797; GFX8-NEXT:    v_mov_b32_e32 v1, s1
798; GFX8-NEXT:    v_mov_b32_e32 v2, s2
799; GFX8-NEXT:    v_mov_b32_e32 v3, s3
800; GFX8-NEXT:    v_mov_b32_e32 v5, s37
801; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
802; GFX8-NEXT:    s_endpgm
803;
804; EG-LABEL: constant_load_v16i64:
805; EG:       ; %bb.0: ; %entry
806; EG-NEXT:    ALU 0, @42, KC0[CB0:0-32], KC1[]
807; EG-NEXT:    TEX 0 @26
808; EG-NEXT:    ALU 3, @43, KC0[CB0:0-32], KC1[]
809; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
810; EG-NEXT:    TEX 0 @28
811; EG-NEXT:    ALU 3, @47, KC0[CB0:0-32], KC1[]
812; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
813; EG-NEXT:    TEX 0 @30
814; EG-NEXT:    ALU 3, @51, KC0[CB0:0-32], KC1[]
815; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
816; EG-NEXT:    TEX 0 @32
817; EG-NEXT:    ALU 3, @55, KC0[CB0:0-32], KC1[]
818; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
819; EG-NEXT:    TEX 0 @34
820; EG-NEXT:    ALU 3, @59, KC0[CB0:0-32], KC1[]
821; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
822; EG-NEXT:    TEX 0 @36
823; EG-NEXT:    ALU 3, @63, KC0[CB0:0-32], KC1[]
824; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
825; EG-NEXT:    TEX 0 @38
826; EG-NEXT:    ALU 3, @67, KC0[CB0:0-32], KC1[]
827; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
828; EG-NEXT:    TEX 0 @40
829; EG-NEXT:    ALU 1, @71, KC0[CB0:0-32], KC1[]
830; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
831; EG-NEXT:    CF_END
832; EG-NEXT:    Fetch clause starting at 26:
833; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 112, #1
834; EG-NEXT:    Fetch clause starting at 28:
835; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 96, #1
836; EG-NEXT:    Fetch clause starting at 30:
837; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 80, #1
838; EG-NEXT:    Fetch clause starting at 32:
839; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 64, #1
840; EG-NEXT:    Fetch clause starting at 34:
841; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
842; EG-NEXT:    Fetch clause starting at 36:
843; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
844; EG-NEXT:    Fetch clause starting at 38:
845; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
846; EG-NEXT:    Fetch clause starting at 40:
847; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
848; EG-NEXT:    ALU clause starting at 42:
849; EG-NEXT:     MOV * T0.X, KC0[2].Z,
850; EG-NEXT:    ALU clause starting at 43:
851; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
852; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
853; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
854; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
855; EG-NEXT:    ALU clause starting at 47:
856; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
857; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
858; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
859; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
860; EG-NEXT:    ALU clause starting at 51:
861; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
862; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
863; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
864; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
865; EG-NEXT:    ALU clause starting at 55:
866; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
867; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
868; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
869; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
870; EG-NEXT:    ALU clause starting at 59:
871; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
872; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
873; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
874; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
875; EG-NEXT:    ALU clause starting at 63:
876; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
877; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
878; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
879; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
880; EG-NEXT:    ALU clause starting at 67:
881; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
882; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
883; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
884; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
885; EG-NEXT:    ALU clause starting at 71:
886; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
887; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
888;
889; GFX12-LABEL: constant_load_v16i64:
890; GFX12:       ; %bb.0: ; %entry
891; GFX12-NEXT:    s_load_b128 s[36:39], s[4:5], 0x24
892; GFX12-NEXT:    s_wait_kmcnt 0x0
893; GFX12-NEXT:    s_clause 0x1
894; GFX12-NEXT:    s_load_b512 s[16:31], s[38:39], 0x40
895; GFX12-NEXT:    s_load_b512 s[0:15], s[38:39], 0x0
896; GFX12-NEXT:    s_wait_kmcnt 0x0
897; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v1, s29
898; GFX12-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s31
899; GFX12-NEXT:    v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v5, s25
900; GFX12-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v7, s27
901; GFX12-NEXT:    v_dual_mov_b32 v6, s26 :: v_dual_mov_b32 v9, s21
902; GFX12-NEXT:    v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v11, s23
903; GFX12-NEXT:    v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v13, s17
904; GFX12-NEXT:    v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v15, s19
905; GFX12-NEXT:    v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v17, s13
906; GFX12-NEXT:    v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v19, s15
907; GFX12-NEXT:    v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s9
908; GFX12-NEXT:    v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v23, s11
909; GFX12-NEXT:    v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v25, s5
910; GFX12-NEXT:    v_dual_mov_b32 v24, s4 :: v_dual_mov_b32 v27, s7
911; GFX12-NEXT:    v_dual_mov_b32 v26, s6 :: v_dual_mov_b32 v29, s1
912; GFX12-NEXT:    v_dual_mov_b32 v28, s0 :: v_dual_mov_b32 v31, s3
913; GFX12-NEXT:    v_mov_b32_e32 v30, s2
914; GFX12-NEXT:    s_clause 0x7
915; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[36:37] offset:112
916; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[36:37] offset:96
917; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[36:37] offset:80
918; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[36:37] offset:64
919; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[36:37] offset:48
920; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[36:37] offset:32
921; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[36:37] offset:16
922; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[36:37]
923; GFX12-NEXT:    s_endpgm
924entry:
925  %ld = load <16 x i64>, ptr addrspace(4) %in
926  store <16 x i64> %ld, ptr addrspace(1) %out
927  ret void
928}
929
; Attribute group #0: carries the `nounwind` attribute and is referenced as `#0` by function definitions in this test (e.g. @constant_load_i64).
930attributes #0 = { nounwind }
931