xref: /llvm-project/llvm/test/CodeGen/AMDGPU/load-global-i32.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-NOHSA %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCNX3-HSA %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
7; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
8
; Kernel under test: loads a single i32 through the addrspace(1) pointer %in
; and stores it through the addrspace(1) pointer %out.
; The check lines inside the body are autogenerated expectations, one group per
; llc configuration listed at the top of the file; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
9define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
10; SI-NOHSA-LABEL: global_load_i32:
11; SI-NOHSA:       ; %bb.0: ; %entry
12; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
13; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
14; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
15; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
16; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
17; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
18; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
19; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
20; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
21; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
22; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
23; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
24; SI-NOHSA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
25; SI-NOHSA-NEXT:    s_endpgm
26;
27; GCNX3-HSA-LABEL: global_load_i32:
28; GCNX3-HSA:       ; %bb.0: ; %entry
29; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
30; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
31; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
32; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
33; GCNX3-HSA-NEXT:    flat_load_dword v2, v[0:1]
34; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
35; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
36; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
37; GCNX3-HSA-NEXT:    flat_store_dword v[0:1], v2
38; GCNX3-HSA-NEXT:    s_endpgm
39;
40; GCNX3-NOHSA-LABEL: global_load_i32:
41; GCNX3-NOHSA:       ; %bb.0: ; %entry
42; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
43; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
44; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
45; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
46; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
47; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
48; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
49; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
50; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
51; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
52; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
53; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
54; GCNX3-NOHSA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
55; GCNX3-NOHSA-NEXT:    s_endpgm
56;
57; EG-LABEL: global_load_i32:
58; EG:       ; %bb.0: ; %entry
59; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
60; EG-NEXT:    TEX 0 @6
61; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
62; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
63; EG-NEXT:    CF_END
64; EG-NEXT:    PAD
65; EG-NEXT:    Fetch clause starting at 6:
66; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
67; EG-NEXT:    ALU clause starting at 8:
68; EG-NEXT:     MOV * T0.X, KC0[2].Z,
69; EG-NEXT:    ALU clause starting at 9:
70; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
71; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
72;
73; GCN-HSA-LABEL: global_load_i32:
74; GCN-HSA:       ; %bb.0: ; %entry
75; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
76; GCN-HSA-NEXT:    v_mov_b32_e32 v0, 0
77; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
78; GCN-HSA-NEXT:    global_load_dword v1, v0, s[2:3]
79; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
80; GCN-HSA-NEXT:    global_store_dword v0, v1, s[0:1]
81; GCN-HSA-NEXT:    s_endpgm
82entry:
83  %ld = load i32, ptr addrspace(1) %in
84  store i32 %ld, ptr addrspace(1) %out
85  ret void
86}
87
; Kernel under test: loads a <2 x i32> vector through the addrspace(1) pointer
; %in and stores it through the addrspace(1) pointer %out.
; The expectations below show each configuration handling the 8-byte value with
; one 64-bit load and one 64-bit store. They are autogenerated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
88define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
89; SI-NOHSA-LABEL: global_load_v2i32:
90; SI-NOHSA:       ; %bb.0: ; %entry
91; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
92; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
93; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
94; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
95; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
96; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
97; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
98; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
99; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
100; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
101; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
102; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
103; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
104; SI-NOHSA-NEXT:    s_endpgm
105;
106; GCNX3-HSA-LABEL: global_load_v2i32:
107; GCNX3-HSA:       ; %bb.0: ; %entry
108; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
109; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
110; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
111; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
112; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
113; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
114; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
115; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
116; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
117; GCNX3-HSA-NEXT:    s_endpgm
118;
119; GCNX3-NOHSA-LABEL: global_load_v2i32:
120; GCNX3-NOHSA:       ; %bb.0: ; %entry
121; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
122; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
123; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
124; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
125; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
126; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
127; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
128; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
129; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
130; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
131; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
132; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
133; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
134; GCNX3-NOHSA-NEXT:    s_endpgm
135;
136; EG-LABEL: global_load_v2i32:
137; EG:       ; %bb.0: ; %entry
138; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
139; EG-NEXT:    TEX 0 @6
140; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
141; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
142; EG-NEXT:    CF_END
143; EG-NEXT:    PAD
144; EG-NEXT:    Fetch clause starting at 6:
145; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
146; EG-NEXT:    ALU clause starting at 8:
147; EG-NEXT:     MOV * T0.X, KC0[2].Z,
148; EG-NEXT:    ALU clause starting at 9:
149; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
150; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
151;
152; GCN-HSA-LABEL: global_load_v2i32:
153; GCN-HSA:       ; %bb.0: ; %entry
154; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
155; GCN-HSA-NEXT:    v_mov_b32_e32 v2, 0
156; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
157; GCN-HSA-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
158; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
159; GCN-HSA-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
160; GCN-HSA-NEXT:    s_endpgm
161entry:
162  %ld = load <2 x i32>, ptr addrspace(1) %in
163  store <2 x i32> %ld, ptr addrspace(1) %out
164  ret void
165}
166
; Kernel under test: loads a <3 x i32> vector through the addrspace(1) pointer
; %in and stores it through the addrspace(1) pointer %out.
; In the expectations below, the default-CPU amdgcn configuration and the r600
; configuration read four dwords and write the result as two stores (two dwords
; at offset 0 plus one dword at offset 8), while the remaining configurations
; use a single 3-dword load and a single 3-dword store.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
167define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
168; SI-NOHSA-LABEL: global_load_v3i32:
169; SI-NOHSA:       ; %bb.0: ; %entry
170; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
171; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
172; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
173; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
174; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
175; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
176; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
177; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
178; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
179; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
180; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
181; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
182; SI-NOHSA-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
183; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
184; SI-NOHSA-NEXT:    s_endpgm
185;
186; GCNX3-HSA-LABEL: global_load_v3i32:
187; GCNX3-HSA:       ; %bb.0: ; %entry
188; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
189; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
190; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
191; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
192; GCNX3-HSA-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
193; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s0
194; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s1
195; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
196; GCNX3-HSA-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
197; GCNX3-HSA-NEXT:    s_endpgm
198;
199; GCNX3-NOHSA-LABEL: global_load_v3i32:
200; GCNX3-NOHSA:       ; %bb.0: ; %entry
201; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
202; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
203; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
204; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
205; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
206; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
207; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
208; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
209; GCNX3-NOHSA-NEXT:    buffer_load_dwordx3 v[0:2], off, s[8:11], 0
210; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
211; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
212; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
213; GCNX3-NOHSA-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
214; GCNX3-NOHSA-NEXT:    s_endpgm
215;
216; EG-LABEL: global_load_v3i32:
217; EG:       ; %bb.0: ; %entry
218; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
219; EG-NEXT:    TEX 0 @6
220; EG-NEXT:    ALU 6, @9, KC0[CB0:0-32], KC1[]
221; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
222; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
223; EG-NEXT:    CF_END
224; EG-NEXT:    Fetch clause starting at 6:
225; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
226; EG-NEXT:    ALU clause starting at 8:
227; EG-NEXT:     MOV * T0.X, KC0[2].Z,
228; EG-NEXT:    ALU clause starting at 9:
229; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
230; EG-NEXT:     MOV * T2.X, T0.Z,
231; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
232; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
233; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
234; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
235; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
236;
237; GCN-HSA-LABEL: global_load_v3i32:
238; GCN-HSA:       ; %bb.0: ; %entry
239; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
240; GCN-HSA-NEXT:    v_mov_b32_e32 v3, 0
241; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
242; GCN-HSA-NEXT:    global_load_dwordx3 v[0:2], v3, s[2:3]
243; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
244; GCN-HSA-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
245; GCN-HSA-NEXT:    s_endpgm
246entry:
247  %ld = load <3 x i32>, ptr addrspace(1) %in
248  store <3 x i32> %ld, ptr addrspace(1) %out
249  ret void
250}
251
; Kernel under test: loads a <4 x i32> vector through the addrspace(1) pointer
; %in and stores it through the addrspace(1) pointer %out.
; The expectations below show each configuration handling the 16-byte value
; with one 128-bit load and one 128-bit store. They are autogenerated;
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
252define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
253; SI-NOHSA-LABEL: global_load_v4i32:
254; SI-NOHSA:       ; %bb.0: ; %entry
255; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
256; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
257; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
258; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
259; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
260; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
261; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
262; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
263; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
264; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
265; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
266; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
267; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
268; SI-NOHSA-NEXT:    s_endpgm
269;
270; GCNX3-HSA-LABEL: global_load_v4i32:
271; GCNX3-HSA:       ; %bb.0: ; %entry
272; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
273; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
274; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
275; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
276; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
277; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
278; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
279; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
280; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
281; GCNX3-HSA-NEXT:    s_endpgm
282;
283; GCNX3-NOHSA-LABEL: global_load_v4i32:
284; GCNX3-NOHSA:       ; %bb.0: ; %entry
285; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
286; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
287; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
288; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
289; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
290; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
291; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
292; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
293; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
294; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
295; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
296; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
297; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
298; GCNX3-NOHSA-NEXT:    s_endpgm
299;
300; EG-LABEL: global_load_v4i32:
301; EG:       ; %bb.0: ; %entry
302; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
303; EG-NEXT:    TEX 0 @6
304; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
305; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
306; EG-NEXT:    CF_END
307; EG-NEXT:    PAD
308; EG-NEXT:    Fetch clause starting at 6:
309; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
310; EG-NEXT:    ALU clause starting at 8:
311; EG-NEXT:     MOV * T0.X, KC0[2].Z,
312; EG-NEXT:    ALU clause starting at 9:
313; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
314; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
315;
316; GCN-HSA-LABEL: global_load_v4i32:
317; GCN-HSA:       ; %bb.0: ; %entry
318; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
319; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
320; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
321; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
322; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
323; GCN-HSA-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
324; GCN-HSA-NEXT:    s_endpgm
325entry:
326  %ld = load <4 x i32>, ptr addrspace(1) %in
327  store <4 x i32> %ld, ptr addrspace(1) %out
328  ret void
329}
330
; Kernel under test: loads an <8 x i32> vector through the addrspace(1) pointer
; %in and stores it through the addrspace(1) pointer %out.
; In the expectations below every configuration splits the 32-byte value into
; two 128-bit loads and two 128-bit stores, at byte offsets 0 and 16.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
331define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
332; SI-NOHSA-LABEL: global_load_v8i32:
333; SI-NOHSA:       ; %bb.0: ; %entry
334; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
335; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
336; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
337; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
338; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
339; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
340; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
341; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
342; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
343; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
344; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
345; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
346; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
347; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
348; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
349; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
350; SI-NOHSA-NEXT:    s_endpgm
351;
352; GCNX3-HSA-LABEL: global_load_v8i32:
353; GCNX3-HSA:       ; %bb.0: ; %entry
354; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
355; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
356; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
357; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
358; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
359; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
360; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
361; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
362; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
363; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
364; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s1
365; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s0
366; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 16
367; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
368; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s1
369; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, s0
370; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
371; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
372; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
373; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
374; GCNX3-HSA-NEXT:    s_endpgm
375;
376; GCNX3-NOHSA-LABEL: global_load_v8i32:
377; GCNX3-NOHSA:       ; %bb.0: ; %entry
378; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
379; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
380; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
381; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
382; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
383; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
384; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
385; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
386; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
387; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
388; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
389; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
390; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
391; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
392; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
393; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
394; GCNX3-NOHSA-NEXT:    s_endpgm
395;
396; EG-LABEL: global_load_v8i32:
397; EG:       ; %bb.0: ; %entry
398; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
399; EG-NEXT:    TEX 1 @6
400; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
401; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
402; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
403; EG-NEXT:    CF_END
404; EG-NEXT:    Fetch clause starting at 6:
405; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
406; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
407; EG-NEXT:    ALU clause starting at 10:
408; EG-NEXT:     MOV * T0.X, KC0[2].Z,
409; EG-NEXT:    ALU clause starting at 11:
410; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
411; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
412; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
413; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
414; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
415;
416; GCN-HSA-LABEL: global_load_v8i32:
417; GCN-HSA:       ; %bb.0: ; %entry
418; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
419; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
420; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
421; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
422; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
423; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
424; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
425; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
426; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
427; GCN-HSA-NEXT:    s_endpgm
428entry:
429  %ld = load <8 x i32>, ptr addrspace(1) %in
430  store <8 x i32> %ld, ptr addrspace(1) %out
431  ret void
432}
433
; Kernel under test: loads a <9 x i32> vector through the addrspace(1) pointer
; %in and stores it through the addrspace(1) pointer %out.
; In the expectations below every configuration splits the 36-byte value into
; two 128-bit pieces (byte offsets 0 and 16) plus a single dword at byte
; offset 32, for both the loads and the stores.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
434define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
435; SI-NOHSA-LABEL: global_load_v9i32:
436; SI-NOHSA:       ; %bb.0: ; %entry
437; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
438; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
439; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
440; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
441; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
442; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
443; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
444; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
445; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
446; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
447; SI-NOHSA-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:32
448; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
449; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
450; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
451; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
452; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
453; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
454; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
455; SI-NOHSA-NEXT:    buffer_store_dword v8, off, s[4:7], 0 offset:32
456; SI-NOHSA-NEXT:    s_endpgm
457;
458; GCNX3-HSA-LABEL: global_load_v9i32:
459; GCNX3-HSA:       ; %bb.0: ; %entry
460; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
461; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
462; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
463; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
464; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
465; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
466; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
467; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
468; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
469; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
470; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
471; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
472; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
473; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
474; GCNX3-HSA-NEXT:    flat_load_dword v14, v[8:9]
475; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
476; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s1
477; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
478; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s0
479; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
480; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
481; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s3
482; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
483; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, s2
484; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
485; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
486; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
487; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
488; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
489; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
490; GCNX3-HSA-NEXT:    flat_store_dword v[12:13], v14
491; GCNX3-HSA-NEXT:    s_endpgm
492;
493; GCNX3-NOHSA-LABEL: global_load_v9i32:
494; GCNX3-NOHSA:       ; %bb.0: ; %entry
495; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
496; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
497; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
498; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
499; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
500; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
501; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
502; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
503; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
504; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
505; GCNX3-NOHSA-NEXT:    buffer_load_dword v8, off, s[8:11], 0 offset:32
506; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
507; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
508; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
509; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
510; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
511; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
512; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
513; GCNX3-NOHSA-NEXT:    buffer_store_dword v8, off, s[4:7], 0 offset:32
514; GCNX3-NOHSA-NEXT:    s_endpgm
515;
516; EG-LABEL: global_load_v9i32:
517; EG:       ; %bb.0: ; %entry
518; EG-NEXT:    ALU 8, @14, KC0[CB0:0-32], KC1[]
519; EG-NEXT:    TEX 2 @8
520; EG-NEXT:    ALU 1, @23, KC0[CB0:0-32], KC1[]
521; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
522; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
523; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T0.X, 1
524; EG-NEXT:    CF_END
525; EG-NEXT:    PAD
526; EG-NEXT:    Fetch clause starting at 8:
527; EG-NEXT:     VTX_READ_128 T4.XYZW, T2.X, 0, #1
528; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 16, #1
529; EG-NEXT:     VTX_READ_32 T3.X, T3.X, 32, #1
530; EG-NEXT:    ALU clause starting at 14:
531; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
532; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
533; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
534; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
535; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
536; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
537; EG-NEXT:     MOV * T2.X, KC0[2].Z,
538; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
539; EG-NEXT:     MOV * T3.X, PS,
540; EG-NEXT:    ALU clause starting at 23:
541; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
542; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
543;
544; GCN-HSA-LABEL: global_load_v9i32:
545; GCN-HSA:       ; %bb.0: ; %entry
546; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
547; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
548; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
549; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
550; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
551; GCN-HSA-NEXT:    global_load_dword v9, v8, s[2:3] offset:32
552; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
553; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
554; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
555; GCN-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
556; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
557; GCN-HSA-NEXT:    global_store_dword v8, v9, s[0:1] offset:32
558; GCN-HSA-NEXT:    s_endpgm
559entry:
560  %ld = load <9 x i32>, ptr addrspace(1) %in
561  store <9 x i32> %ld, ptr addrspace(1) %out
562  ret void
563}
564
; Kernel under test: loads a <10 x i32> vector through the addrspace(1) pointer
; %in and stores it through the addrspace(1) pointer %out.
; In the expectations below the amdgcn configurations split the 40-byte value
; into two 128-bit pieces (byte offsets 0 and 16) plus one 64-bit piece at byte
; offset 32. The r600 expectations read three 128-bit chunks but store only the
; XY components of the last one.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
565define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
566; SI-NOHSA-LABEL: global_load_v10i32:
567; SI-NOHSA:       ; %bb.0: ; %entry
568; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
569; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
570; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
571; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
572; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
573; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
574; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
575; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
576; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
577; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
578; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
579; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
580; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
581; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
582; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
583; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
584; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
585; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
586; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
587; SI-NOHSA-NEXT:    s_endpgm
588;
589; GCNX3-HSA-LABEL: global_load_v10i32:
590; GCNX3-HSA:       ; %bb.0: ; %entry
591; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
592; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
593; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
594; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
595; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
596; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
597; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
598; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
599; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
600; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
601; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
602; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
603; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
604; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
605; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
606; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
607; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s1
608; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
609; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, s0
610; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
611; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
612; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s3
613; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s1
614; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s2
615; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s0
616; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
617; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
618; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
619; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
620; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
621; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[14:15], v[8:9]
622; GCNX3-HSA-NEXT:    s_endpgm
623;
624; GCNX3-NOHSA-LABEL: global_load_v10i32:
625; GCNX3-NOHSA:       ; %bb.0: ; %entry
626; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
627; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
628; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
629; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
630; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
631; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
632; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
633; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
634; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
635; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
636; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
637; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
638; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
639; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
640; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
641; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
642; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
643; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
644; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
645; GCNX3-NOHSA-NEXT:    s_endpgm
646;
647; EG-LABEL: global_load_v10i32:
648; EG:       ; %bb.0: ; %entry
649; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
650; EG-NEXT:    TEX 2 @8
651; EG-NEXT:    ALU 7, @15, KC0[CB0:0-32], KC1[]
652; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
653; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
654; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
655; EG-NEXT:    CF_END
656; EG-NEXT:    PAD
657; EG-NEXT:    Fetch clause starting at 8:
658; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
659; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
660; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
661; EG-NEXT:    ALU clause starting at 14:
662; EG-NEXT:     MOV * T0.X, KC0[2].Z,
663; EG-NEXT:    ALU clause starting at 15:
664; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
665; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
666; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
667; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
668; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
669; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
670; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
671; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
672;
673; GCN-HSA-LABEL: global_load_v10i32:
674; GCN-HSA:       ; %bb.0: ; %entry
675; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
676; GCN-HSA-NEXT:    v_mov_b32_e32 v10, 0
677; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
678; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v10, s[2:3]
679; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v10, s[2:3] offset:16
680; GCN-HSA-NEXT:    global_load_dwordx2 v[8:9], v10, s[2:3] offset:32
681; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
682; GCN-HSA-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
683; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
684; GCN-HSA-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
685; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
686; GCN-HSA-NEXT:    global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
687; GCN-HSA-NEXT:    s_endpgm
688entry:
689  %ld = load <10 x i32>, ptr addrspace(1) %in
690  store <10 x i32> %ld, ptr addrspace(1) %out
691  ret void
692}
693
; Copies an <11 x i32> (44 bytes) from the global (addrspace 1) pointer %in to
; %out; the checks below pin how each configuration splits this odd-sized
; vector. Per the expected output, the SI-NOHSA run issues three dwordx4 loads
; and stores dwordx4 + dwordx4 + dwordx2 (offset 32) + dword (offset 40); the
; GCNX3 and GCN-HSA runs use a dwordx3 access for the 12-byte tail at offset
; 32; the EG run stores the tail as T0.XY plus a separate one-channel store at
; byte offset 40.
694define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
695; SI-NOHSA-LABEL: global_load_v11i32:
696; SI-NOHSA:       ; %bb.0: ; %entry
697; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
698; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
699; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
700; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
701; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
702; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
703; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
704; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
705; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
706; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
707; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
708; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
709; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
710; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
711; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
712; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
713; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
714; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
715; SI-NOHSA-NEXT:    buffer_store_dword v10, off, s[4:7], 0 offset:40
716; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
717; SI-NOHSA-NEXT:    s_endpgm
718;
719; GCNX3-HSA-LABEL: global_load_v11i32:
720; GCNX3-HSA:       ; %bb.0: ; %entry
721; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
722; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
723; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
724; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
725; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
726; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
727; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
728; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
729; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
730; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
731; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
732; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
733; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
734; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
735; GCNX3-HSA-NEXT:    flat_load_dwordx3 v[8:10], v[8:9]
736; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
737; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s1
738; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
739; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s0
740; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
741; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
742; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s3
743; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s1
744; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s2
745; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s0
746; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
747; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[0:3]
748; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
749; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[4:7]
750; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
751; GCNX3-HSA-NEXT:    flat_store_dwordx3 v[15:16], v[8:10]
752; GCNX3-HSA-NEXT:    s_endpgm
753;
754; GCNX3-NOHSA-LABEL: global_load_v11i32:
755; GCNX3-NOHSA:       ; %bb.0: ; %entry
756; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
757; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
758; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
759; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
760; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
761; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
762; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
763; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
764; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
765; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
766; GCNX3-NOHSA-NEXT:    buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
767; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
768; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
769; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
770; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
771; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
772; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
773; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
774; GCNX3-NOHSA-NEXT:    buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
775; GCNX3-NOHSA-NEXT:    s_endpgm
776;
777; EG-LABEL: global_load_v11i32:
778; EG:       ; %bb.0: ; %entry
779; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
780; EG-NEXT:    TEX 2 @8
781; EG-NEXT:    ALU 12, @15, KC0[CB0:0-32], KC1[]
782; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
783; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
784; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
785; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
786; EG-NEXT:    CF_END
787; EG-NEXT:    Fetch clause starting at 8:
788; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
789; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
790; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
791; EG-NEXT:    ALU clause starting at 14:
792; EG-NEXT:     MOV * T0.X, KC0[2].Z,
793; EG-NEXT:    ALU clause starting at 15:
794; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
795; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
796; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
797; EG-NEXT:     MOV * T4.X, T0.Z,
798; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
799; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
800; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
801; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
802; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
803; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
804; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
805; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
806; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
807;
808; GCN-HSA-LABEL: global_load_v11i32:
809; GCN-HSA:       ; %bb.0: ; %entry
810; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
811; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
812; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
813; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v11, s[2:3]
814; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v11, s[2:3] offset:16
815; GCN-HSA-NEXT:    global_load_dwordx3 v[8:10], v11, s[2:3] offset:32
816; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
817; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[0:3], s[0:1]
818; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
819; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[4:7], s[0:1] offset:16
820; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
821; GCN-HSA-NEXT:    global_store_dwordx3 v11, v[8:10], s[0:1] offset:32
822; GCN-HSA-NEXT:    s_endpgm
823entry:
824  %ld = load <11 x i32>, ptr addrspace(1) %in
825  store <11 x i32> %ld, ptr addrspace(1) %out
826  ret void
827}
828
829
; Copies a <12 x i32> (48 bytes) from the global (addrspace 1) pointer %in to
; %out. Every GCN configuration checked below is expected to issue three
; dwordx4 loads (offsets 0, 16, 32) followed by three dwordx4 stores at the
; same offsets; the EG run expects three VTX_READ_128 fetches and three XYZW
; stores.
830define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
831; SI-NOHSA-LABEL: global_load_v12i32:
832; SI-NOHSA:       ; %bb.0: ; %entry
833; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
834; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
835; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
836; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
837; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
838; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
839; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
840; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
841; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
842; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
843; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
844; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
845; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
846; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
847; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
848; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
849; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
850; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
851; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
852; SI-NOHSA-NEXT:    s_endpgm
853;
854; GCNX3-HSA-LABEL: global_load_v12i32:
855; GCNX3-HSA:       ; %bb.0: ; %entry
856; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
857; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
858; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
859; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
860; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
861; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
862; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
863; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
864; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
865; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
866; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
867; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
868; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
869; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
870; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
871; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
872; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
873; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
874; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
875; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
876; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
877; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
878; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s1
879; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
880; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s0
881; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
882; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
883; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
884; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
885; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
886; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
887; GCNX3-HSA-NEXT:    s_endpgm
888;
889; GCNX3-NOHSA-LABEL: global_load_v12i32:
890; GCNX3-NOHSA:       ; %bb.0: ; %entry
891; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
892; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
893; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
894; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
895; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
896; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
897; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
898; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
899; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
900; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
901; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
902; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
903; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
904; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
905; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
906; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
907; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
908; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
909; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
910; GCNX3-NOHSA-NEXT:    s_endpgm
911;
912; EG-LABEL: global_load_v12i32:
913; EG:       ; %bb.0: ; %entry
914; EG-NEXT:    ALU 7, @14, KC0[CB0:0-32], KC1[]
915; EG-NEXT:    TEX 2 @8
916; EG-NEXT:    ALU 1, @22, KC0[CB0:0-32], KC1[]
917; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0
918; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
919; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
920; EG-NEXT:    CF_END
921; EG-NEXT:    PAD
922; EG-NEXT:    Fetch clause starting at 8:
923; EG-NEXT:     VTX_READ_128 T3.XYZW, T2.X, 0, #1
924; EG-NEXT:     VTX_READ_128 T4.XYZW, T2.X, 16, #1
925; EG-NEXT:     VTX_READ_128 T2.XYZW, T2.X, 32, #1
926; EG-NEXT:    ALU clause starting at 14:
927; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
928; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
929; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
930; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
931; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
932; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
933; EG-NEXT:     MOV * T2.X, KC0[2].Z,
934; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
935; EG-NEXT:    ALU clause starting at 22:
936; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
937; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
938;
939; GCN-HSA-LABEL: global_load_v12i32:
940; GCN-HSA:       ; %bb.0: ; %entry
941; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
942; GCN-HSA-NEXT:    v_mov_b32_e32 v12, 0
943; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
944; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v12, s[2:3]
945; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
946; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v12, s[2:3] offset:32
947; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
948; GCN-HSA-NEXT:    global_store_dwordx4 v12, v[0:3], s[0:1]
949; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
950; GCN-HSA-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
951; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
952; GCN-HSA-NEXT:    global_store_dwordx4 v12, v[8:11], s[0:1] offset:32
953; GCN-HSA-NEXT:    s_endpgm
954entry:
955  %ld = load <12 x i32>, ptr addrspace(1) %in
956  store <12 x i32> %ld, ptr addrspace(1) %out
957  ret void
958}
959
; Copies a <16 x i32> (64 bytes) from the global (addrspace 1) pointer %in to
; %out. The expected output for every configuration is four 128-bit loads
; followed by four 128-bit stores. Note that the checked issue order of the
; byte offsets in the buffer_*, global_* and EG VTX_READ sequences is
; 32, 48, 0, 16 rather than ascending.
960define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
961; SI-NOHSA-LABEL: global_load_v16i32:
962; SI-NOHSA:       ; %bb.0: ; %entry
963; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
964; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
965; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
966; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
967; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
968; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
969; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
970; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
971; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
972; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
973; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
974; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
975; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
976; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
977; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
978; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
979; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
980; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
981; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
982; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
983; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
984; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
985; SI-NOHSA-NEXT:    s_endpgm
986;
987; GCNX3-HSA-LABEL: global_load_v16i32:
988; GCNX3-HSA:       ; %bb.0: ; %entry
989; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
990; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
991; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
992; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
993; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
994; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
995; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
996; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
997; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
998; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
999; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 32
1000; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
1001; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s4
1002; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
1003; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s5
1004; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
1005; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1006; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1007; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
1008; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
1009; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
1010; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
1011; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, s3
1012; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, s2
1013; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
1014; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s1
1015; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
1016; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s0
1017; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 16
1018; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
1019; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
1020; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s1
1021; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
1022; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s0
1023; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
1024; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
1025; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
1026; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
1027; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
1028; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
1029; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
1030; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[12:15]
1031; GCNX3-HSA-NEXT:    s_endpgm
1032;
1033; GCNX3-NOHSA-LABEL: global_load_v16i32:
1034; GCNX3-NOHSA:       ; %bb.0: ; %entry
1035; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1036; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1037; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1038; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1039; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1040; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1041; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1042; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1043; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
1044; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
1045; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
1046; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
1047; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1048; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1049; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
1050; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
1051; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
1052; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
1053; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
1054; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
1055; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
1056; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
1057; GCNX3-NOHSA-NEXT:    s_endpgm
1058;
1059; EG-LABEL: global_load_v16i32:
1060; EG:       ; %bb.0: ; %entry
1061; EG-NEXT:    ALU 11, @16, KC0[CB0:0-32], KC1[]
1062; EG-NEXT:    TEX 3 @8
1063; EG-NEXT:    ALU 1, @28, KC0[], KC1[]
1064; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
1065; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0
1066; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0
1067; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
1068; EG-NEXT:    CF_END
1069; EG-NEXT:    Fetch clause starting at 8:
1070; EG-NEXT:     VTX_READ_128 T4.XYZW, T3.X, 32, #1
1071; EG-NEXT:     VTX_READ_128 T5.XYZW, T3.X, 48, #1
1072; EG-NEXT:     VTX_READ_128 T6.XYZW, T3.X, 0, #1
1073; EG-NEXT:     VTX_READ_128 T3.XYZW, T3.X, 16, #1
1074; EG-NEXT:    ALU clause starting at 16:
1075; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1076; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1077; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
1078; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1079; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1080; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1081; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
1082; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1083; EG-NEXT:     MOV * T3.X, KC0[2].Z,
1084; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1085; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1086; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1087; EG-NEXT:    ALU clause starting at 28:
1088; EG-NEXT:     LSHR * T7.X, T0.W, literal.x,
1089; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1090;
1091; GCN-HSA-LABEL: global_load_v16i32:
1092; GCN-HSA:       ; %bb.0: ; %entry
1093; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1094; GCN-HSA-NEXT:    v_mov_b32_e32 v16, 0
1095; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1096; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3] offset:32
1097; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:48
1098; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v16, s[2:3]
1099; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v16, s[2:3] offset:16
1100; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
1101; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:32
1102; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
1103; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:48
1104; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
1105; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1]
1106; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
1107; GCN-HSA-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:16
1108; GCN-HSA-NEXT:    s_endpgm
1109entry:
1110  %ld = load <16 x i32>, ptr addrspace(1) %in
1111  store <16 x i32> %ld, ptr addrspace(1) %out
1112  ret void
1113}
1114
; Loads an i32 from the global (addrspace 1) pointer %in, zero-extends it to
; i64 and stores the result to %out. The checks expect a single dword load,
; the upper 32 bits supplied as a constant zero (v_mov_b32 of 0 in the GCN
; runs, MOV of 0.0 into T0.Y in the EG run), and one 64-bit store.
1115define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1116; SI-NOHSA-LABEL: global_zextload_i32_to_i64:
1117; SI-NOHSA:       ; %bb.0:
1118; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1119; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1120; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1121; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1122; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1123; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1124; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1125; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1126; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1127; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1128; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1129; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
1130; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1131; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1132; SI-NOHSA-NEXT:    s_endpgm
1133;
1134; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
1135; GCNX3-HSA:       ; %bb.0:
1136; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1137; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1138; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1139; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1140; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
1141; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
1142; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
1143; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
1144; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1145; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1146; GCNX3-HSA-NEXT:    s_endpgm
1147;
1148; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
1149; GCNX3-NOHSA:       ; %bb.0:
1150; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1151; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1152; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1153; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1154; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1155; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1156; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1157; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1158; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1159; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1160; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1161; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
1162; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1163; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1164; GCNX3-NOHSA-NEXT:    s_endpgm
1165;
1166; EG-LABEL: global_zextload_i32_to_i64:
1167; EG:       ; %bb.0:
1168; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1169; EG-NEXT:    TEX 0 @6
1170; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1171; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1172; EG-NEXT:    CF_END
1173; EG-NEXT:    PAD
1174; EG-NEXT:    Fetch clause starting at 6:
1175; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1176; EG-NEXT:    ALU clause starting at 8:
1177; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1178; EG-NEXT:    ALU clause starting at 9:
1179; EG-NEXT:     MOV * T0.Y, 0.0,
1180; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1181; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1182;
1183; GCN-HSA-LABEL: global_zextload_i32_to_i64:
1184; GCN-HSA:       ; %bb.0:
1185; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1186; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
1187; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1188; GCN-HSA-NEXT:    global_load_dword v0, v1, s[2:3]
1189; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1190; GCN-HSA-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
1191; GCN-HSA-NEXT:    s_endpgm
1192  %ld = load i32, ptr addrspace(1) %in
1193  %ext = zext i32 %ld to i64
1194  store i64 %ext, ptr addrspace(1) %out
1195  ret void
1196}
1197
; Loads an i32 from the global (addrspace 1) pointer %in, sign-extends it to
; i64 and stores the result to %out. The checks expect a single dword load,
; the upper 32 bits produced by an arithmetic shift right by 31 of the loaded
; value (v_ashrrev_i32 in the GCN runs, ASHR in the EG run), and one 64-bit
; store. In the GCN runs the shift is placed after the vmcnt(0) wait because
; it consumes the loaded register.
1198define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1199; SI-NOHSA-LABEL: global_sextload_i32_to_i64:
1200; SI-NOHSA:       ; %bb.0:
1201; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1202; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1203; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1204; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1205; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1206; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1207; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1208; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1209; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1210; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1211; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1212; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1213; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1214; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1215; SI-NOHSA-NEXT:    s_endpgm
1216;
1217; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
1218; GCNX3-HSA:       ; %bb.0:
1219; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1220; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1221; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1222; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1223; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
1224; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
1225; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
1226; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1227; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1228; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1229; GCNX3-HSA-NEXT:    s_endpgm
1230;
1231; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
1232; GCNX3-NOHSA:       ; %bb.0:
1233; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1234; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1235; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1236; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1237; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1238; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1239; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1240; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1241; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1242; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1243; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1244; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1245; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1246; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1247; GCNX3-NOHSA-NEXT:    s_endpgm
1248;
1249; EG-LABEL: global_sextload_i32_to_i64:
1250; EG:       ; %bb.0:
1251; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1252; EG-NEXT:    TEX 0 @6
1253; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1254; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1255; EG-NEXT:    CF_END
1256; EG-NEXT:    PAD
1257; EG-NEXT:    Fetch clause starting at 6:
1258; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1259; EG-NEXT:    ALU clause starting at 8:
1260; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1261; EG-NEXT:    ALU clause starting at 9:
1262; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1263; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
1264; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
1265;
1266; GCN-HSA-LABEL: global_sextload_i32_to_i64:
1267; GCN-HSA:       ; %bb.0:
1268; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1269; GCN-HSA-NEXT:    v_mov_b32_e32 v2, 0
1270; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1271; GCN-HSA-NEXT:    global_load_dword v0, v2, s[2:3]
1272; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1273; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1274; GCN-HSA-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1275; GCN-HSA-NEXT:    s_endpgm
1276  %ld = load i32, ptr addrspace(1) %in
1277  %ext = sext i32 %ld to i64
1278  store i64 %ext, ptr addrspace(1) %out
1279  ret void
1280}
1281
; Single-element vector form of the zero-extending load: loads a <1 x i32>
; from the global (addrspace 1) pointer %in, zero-extends it to <1 x i64> and
; stores the result to %out. The checks expect one dword load, a constant zero
; for the upper 32 bits (v_mov_b32 of 0 in the GCN runs, MOV of 0.0 into T0.Y
; in the EG run), and one 64-bit store.
1282define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1283; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
1284; SI-NOHSA:       ; %bb.0:
1285; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1286; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1287; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1288; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1289; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1290; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1291; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1292; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1293; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1294; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1295; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1296; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
1297; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1298; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1299; SI-NOHSA-NEXT:    s_endpgm
1300;
1301; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
1302; GCNX3-HSA:       ; %bb.0:
1303; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1304; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1305; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1306; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1307; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
1308; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
1309; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
1310; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
1311; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1312; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1313; GCNX3-HSA-NEXT:    s_endpgm
1314;
1315; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
1316; GCNX3-NOHSA:       ; %bb.0:
1317; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1318; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1319; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1320; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1321; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1322; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1323; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1324; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1325; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1326; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1327; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1328; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
1329; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1330; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1331; GCNX3-NOHSA-NEXT:    s_endpgm
1332;
1333; EG-LABEL: global_zextload_v1i32_to_v1i64:
1334; EG:       ; %bb.0:
1335; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1336; EG-NEXT:    TEX 0 @6
1337; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1338; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1339; EG-NEXT:    CF_END
1340; EG-NEXT:    PAD
1341; EG-NEXT:    Fetch clause starting at 6:
1342; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1343; EG-NEXT:    ALU clause starting at 8:
1344; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1345; EG-NEXT:    ALU clause starting at 9:
1346; EG-NEXT:     MOV * T0.Y, 0.0,
1347; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1348; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1349;
1350; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64:
1351; GCN-HSA:       ; %bb.0:
1352; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1353; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
1354; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1355; GCN-HSA-NEXT:    global_load_dword v0, v1, s[2:3]
1356; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1357; GCN-HSA-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
1358; GCN-HSA-NEXT:    s_endpgm
1359  %ld = load <1 x i32>, ptr addrspace(1) %in
1360  %ext = zext <1 x i32> %ld to <1 x i64>
1361  store <1 x i64> %ext, ptr addrspace(1) %out
1362  ret void
1363}
1364
; Test: load a <1 x i32> from global memory (%in), sign-extend it to <1 x i64>,
; and store the result to %out.
; In the expected GCN output the upper dword is the loaded dword arithmetically
; shifted right by 31 (v_ashrrev_i32); the EG output uses ASHR with literal 31.
; The check lines are autogenerated (see the NOTE at the top of this file):
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1365define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1366; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
1367; SI-NOHSA:       ; %bb.0:
1368; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1369; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1370; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1371; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1372; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1373; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1374; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1375; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1376; SI-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1377; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1378; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1379; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1380; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1381; SI-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1382; SI-NOHSA-NEXT:    s_endpgm
1383;
1384; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
1385; GCNX3-HSA:       ; %bb.0:
1386; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1387; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1388; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1389; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1390; GCNX3-HSA-NEXT:    flat_load_dword v0, v[0:1]
1391; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s0
1392; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s1
1393; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1394; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1395; GCNX3-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1396; GCNX3-HSA-NEXT:    s_endpgm
1397;
1398; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
1399; GCNX3-NOHSA:       ; %bb.0:
1400; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1401; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1402; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1403; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1404; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1405; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1406; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1407; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1408; GCNX3-NOHSA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1409; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1410; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1411; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1412; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1413; GCNX3-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1414; GCNX3-NOHSA-NEXT:    s_endpgm
1415;
1416; EG-LABEL: global_sextload_v1i32_to_v1i64:
1417; EG:       ; %bb.0:
1418; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1419; EG-NEXT:    TEX 0 @6
1420; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1421; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1422; EG-NEXT:    CF_END
1423; EG-NEXT:    PAD
1424; EG-NEXT:    Fetch clause starting at 6:
1425; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1426; EG-NEXT:    ALU clause starting at 8:
1427; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1428; EG-NEXT:    ALU clause starting at 9:
1429; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
1430; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
1431; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
1432;
1433; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64:
1434; GCN-HSA:       ; %bb.0:
1435; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1436; GCN-HSA-NEXT:    v_mov_b32_e32 v2, 0
1437; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1438; GCN-HSA-NEXT:    global_load_dword v0, v2, s[2:3]
1439; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1440; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1441; GCN-HSA-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1442; GCN-HSA-NEXT:    s_endpgm
1443  %ld = load <1 x i32>, ptr addrspace(1) %in
1444  %ext = sext <1 x i32> %ld to <1 x i64>
1445  store <1 x i64> %ext, ptr addrspace(1) %out
1446  ret void
1447}
1448
; Test: load a <2 x i32> from global memory (%in), zero-extend each element to
; i64, and store the resulting <2 x i64> to %out.
; The expected output does one 64-bit load and one 128-bit store; the upper
; dword of each element is a materialized zero (v_mov_b32 0 on GCN, MOV 0.0 on EG).
; The check lines are autogenerated (see the NOTE at the top of this file):
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1449define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1450; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
1451; SI-NOHSA:       ; %bb.0:
1452; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1453; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1454; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1455; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1456; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1457; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1458; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1459; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1460; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1461; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
1462; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
1463; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1464; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1465; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1466; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
1467; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
1468; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1469; SI-NOHSA-NEXT:    s_endpgm
1470;
1471; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
1472; GCNX3-HSA:       ; %bb.0:
1473; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1474; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1475; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1476; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1477; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
1478; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
1479; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
1480; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
1481; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1482; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v2
1483; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v3
1484; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v1
1485; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1486; GCNX3-HSA-NEXT:    s_endpgm
1487;
1488; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
1489; GCNX3-NOHSA:       ; %bb.0:
1490; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1491; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1492; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1493; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1494; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1495; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1496; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1497; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1498; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
1499; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
1500; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1501; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1502; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1503; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v2
1504; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v3
1505; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
1506; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1507; GCNX3-NOHSA-NEXT:    s_endpgm
1508;
1509; EG-LABEL: global_zextload_v2i32_to_v2i64:
1510; EG:       ; %bb.0:
1511; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1512; EG-NEXT:    TEX 0 @6
1513; EG-NEXT:    ALU 5, @9, KC0[CB0:0-32], KC1[]
1514; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1515; EG-NEXT:    CF_END
1516; EG-NEXT:    PAD
1517; EG-NEXT:    Fetch clause starting at 6:
1518; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1519; EG-NEXT:    ALU clause starting at 8:
1520; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1521; EG-NEXT:    ALU clause starting at 9:
1522; EG-NEXT:     MOV T1.X, T0.X,
1523; EG-NEXT:     MOV T1.Y, 0.0,
1524; EG-NEXT:     MOV T1.Z, T0.Y,
1525; EG-NEXT:     MOV T1.W, 0.0,
1526; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1527; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1528;
1529; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64:
1530; GCN-HSA:       ; %bb.0:
1531; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1532; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
1533; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1534; GCN-HSA-NEXT:    global_load_dwordx2 v[2:3], v1, s[2:3]
1535; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1536; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v2
1537; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v3
1538; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
1539; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
1540; GCN-HSA-NEXT:    s_endpgm
1541  %ld = load <2 x i32>, ptr addrspace(1) %in
1542  %ext = zext <2 x i32> %ld to <2 x i64>
1543  store <2 x i64> %ext, ptr addrspace(1) %out
1544  ret void
1545}
1546
; Test: load a <2 x i32> from global memory (%in), sign-extend each element to
; i64, and store the resulting <2 x i64> to %out.
; The expected output does one 64-bit load and one 128-bit store; each upper
; dword is the matching loaded dword arithmetically shifted right by 31
; (v_ashrrev_i32 on GCN, ASHR with literal 31 on EG).
; The check lines are autogenerated (see the NOTE at the top of this file):
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1547define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1548; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
1549; SI-NOHSA:       ; %bb.0:
1550; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1551; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1552; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1553; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1554; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1555; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1556; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1557; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1558; SI-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1559; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1560; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1561; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1562; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
1563; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v1
1564; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1565; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1566; SI-NOHSA-NEXT:    s_endpgm
1567;
1568; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
1569; GCNX3-HSA:       ; %bb.0:
1570; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1571; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1572; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1573; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1574; GCNX3-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1575; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
1576; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
1577; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1578; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
1579; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v1
1580; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1581; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1582; GCNX3-HSA-NEXT:    s_endpgm
1583;
1584; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
1585; GCNX3-NOHSA:       ; %bb.0:
1586; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1587; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1588; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1589; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1590; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1591; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1592; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1593; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1594; GCNX3-NOHSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1595; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1596; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1597; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1598; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
1599; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v1
1600; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1601; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1602; GCNX3-NOHSA-NEXT:    s_endpgm
1603;
1604; EG-LABEL: global_sextload_v2i32_to_v2i64:
1605; EG:       ; %bb.0:
1606; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1607; EG-NEXT:    TEX 0 @6
1608; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
1609; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1610; EG-NEXT:    CF_END
1611; EG-NEXT:    PAD
1612; EG-NEXT:    Fetch clause starting at 6:
1613; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1614; EG-NEXT:    ALU clause starting at 8:
1615; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1616; EG-NEXT:    ALU clause starting at 9:
1617; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
1618; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1619; EG-NEXT:     ASHR * T1.Y, T0.X, literal.x,
1620; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1621; EG-NEXT:     MOV T1.X, T0.X,
1622; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1623; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1624; EG-NEXT:     MOV * T1.Z, T0.Y,
1625;
1626; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64:
1627; GCN-HSA:       ; %bb.0:
1628; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1629; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
1630; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1631; GCN-HSA-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1632; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1633; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
1634; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v1
1635; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1636; GCN-HSA-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1637; GCN-HSA-NEXT:    s_endpgm
1638  %ld = load <2 x i32>, ptr addrspace(1) %in
1639  %ext = sext <2 x i32> %ld to <2 x i64>
1640  store <2 x i64> %ext, ptr addrspace(1) %out
1641  ret void
1642}
1643
; Test: load a <4 x i32> from global memory (%in), zero-extend each element to
; i64, and store the resulting <4 x i64> to %out.
; The expected output does one 128-bit load and splits the 32-byte result into
; two 128-bit stores (byte offsets 16 and 0); the upper dwords are materialized
; zeros. The GCNX3-HSA flat stores take the +16 address from an explicit
; s_add_u32/s_addc_u32 pair, where the other GCN variants use offset:16.
; The check lines are autogenerated (see the NOTE at the top of this file):
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1644define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1645; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
1646; SI-NOHSA:       ; %bb.0:
1647; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1648; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1649; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1650; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1651; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1652; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1653; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1654; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1655; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1656; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
1657; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
1658; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1659; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1660; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1661; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
1662; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
1663; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1664; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
1665; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
1666; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
1667; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1668; SI-NOHSA-NEXT:    s_endpgm
1669;
1670; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
1671; GCNX3-HSA:       ; %bb.0:
1672; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1673; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, 0
1674; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
1675; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1676; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1677; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1678; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1679; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
1680; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
1681; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
1682; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
1683; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1684; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
1685; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v3
1686; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
1687; GCNX3-HSA-NEXT:    s_nop 0
1688; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v0
1689; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v1
1690; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
1691; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
1692; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
1693; GCNX3-HSA-NEXT:    s_endpgm
1694;
1695; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
1696; GCNX3-NOHSA:       ; %bb.0:
1697; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1698; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1699; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1700; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1701; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1702; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1703; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1704; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1705; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1706; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
1707; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
1708; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1709; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1710; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1711; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
1712; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
1713; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1714; GCNX3-NOHSA-NEXT:    s_nop 0
1715; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
1716; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
1717; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1718; GCNX3-NOHSA-NEXT:    s_endpgm
1719;
1720; EG-LABEL: global_zextload_v4i32_to_v4i64:
1721; EG:       ; %bb.0:
1722; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1723; EG-NEXT:    TEX 0 @6
1724; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1725; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
1726; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
1727; EG-NEXT:    CF_END
1728; EG-NEXT:    Fetch clause starting at 6:
1729; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1730; EG-NEXT:    ALU clause starting at 8:
1731; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1732; EG-NEXT:    ALU clause starting at 9:
1733; EG-NEXT:     MOV T1.X, T0.Z,
1734; EG-NEXT:     MOV T1.Y, 0.0,
1735; EG-NEXT:     MOV * T2.X, T0.X,
1736; EG-NEXT:     MOV T2.Y, 0.0,
1737; EG-NEXT:     MOV T1.Z, T0.W,
1738; EG-NEXT:     MOV T1.W, 0.0,
1739; EG-NEXT:     MOV * T2.Z, T0.Y,
1740; EG-NEXT:     MOV * T2.W, 0.0,
1741; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
1742; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
1743; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1744; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
1745; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1746;
1747; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64:
1748; GCN-HSA:       ; %bb.0:
1749; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1750; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
1751; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
1752; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1753; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3]
1754; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1755; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
1756; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
1757; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
1758; GCN-HSA-NEXT:    s_nop 0
1759; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
1760; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
1761; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
1762; GCN-HSA-NEXT:    s_endpgm
1763  %ld = load <4 x i32>, ptr addrspace(1) %in
1764  %ext = zext <4 x i32> %ld to <4 x i64>
1765  store <4 x i64> %ext, ptr addrspace(1) %out
1766  ret void
1767}
1768
; Test: load a <4 x i32> from global memory (%in), sign-extend each element to
; i64, and store the resulting <4 x i64> to %out.
; The expected output does one 128-bit load and two 128-bit stores (byte
; offsets 16 and 0); each upper dword is the matching loaded dword
; arithmetically shifted right by 31 (v_ashrrev_i32 on GCN, ASHR on EG).
; The check lines are autogenerated (see the NOTE at the top of this file):
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1769define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1770; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
1771; SI-NOHSA:       ; %bb.0:
1772; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1773; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1774; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1775; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1776; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1777; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1778; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1779; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1780; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1781; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1782; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1783; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1784; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
1785; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1786; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
1787; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
1788; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v2
1789; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, v3
1790; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v0
1791; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, v1
1792; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
1793; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
1794; SI-NOHSA-NEXT:    s_endpgm
1795;
1796; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
1797; GCNX3-HSA:       ; %bb.0:
1798; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1799; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1800; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1801; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1802; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1803; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
1804; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
1805; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s3
1806; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s1
1807; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s2
1808; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, s0
1809; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
1810; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
1811; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
1812; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v2
1813; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, v3
1814; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
1815; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1816; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v0
1817; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, v1
1818; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
1819; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[3:6]
1820; GCNX3-HSA-NEXT:    s_endpgm
1821;
1822; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
1823; GCNX3-NOHSA:       ; %bb.0:
1824; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1825; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1826; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1827; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1828; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1829; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1830; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1831; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1832; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1833; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1834; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1835; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
1836; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
1837; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
1838; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v7, v2
1839; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, v3
1840; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
1841; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1842; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v3, v0
1843; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v5, v1
1844; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
1845; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
1846; GCNX3-NOHSA-NEXT:    s_endpgm
1847;
1848; EG-LABEL: global_sextload_v4i32_to_v4i64:
1849; EG:       ; %bb.0:
1850; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1851; EG-NEXT:    TEX 0 @6
1852; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
1853; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0
1854; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1
1855; EG-NEXT:    CF_END
1856; EG-NEXT:    Fetch clause starting at 6:
1857; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1858; EG-NEXT:    ALU clause starting at 8:
1859; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1860; EG-NEXT:    ALU clause starting at 9:
1861; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
1862; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1863; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
1864; EG-NEXT:     ASHR T1.Y, T0.X, literal.y,
1865; EG-NEXT:     ASHR T3.W, T0.W, literal.y,
1866; EG-NEXT:     MOV * T1.X, T0.X,
1867; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
1868; EG-NEXT:     ASHR * T3.Y, T0.Z, literal.x,
1869; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1870; EG-NEXT:     MOV T3.X, T0.Z,
1871; EG-NEXT:     MOV T1.Z, T0.Y,
1872; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1873; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1874; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
1875; EG-NEXT:     MOV * T3.Z, T0.W,
1876; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1877;
1878; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64:
1879; GCN-HSA:       ; %bb.0:
1880; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1881; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
1882; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1883; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v11, s[2:3]
1884; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1885; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
1886; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
1887; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v2
1888; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v3
1889; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
1890; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1891; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v0
1892; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v1
1893; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[7:10], s[0:1] offset:16
1894; GCN-HSA-NEXT:    global_store_dwordx4 v11, v[3:6], s[0:1]
1895; GCN-HSA-NEXT:    s_endpgm
1896  %ld = load <4 x i32>, ptr addrspace(1) %in
1897  %ext = sext <4 x i32> %ld to <4 x i64>
1898  store <4 x i64> %ext, ptr addrspace(1) %out
1899  ret void
1900}
1901
; Test: load an <8 x i32> from global memory (%in), zero-extend each element to
; i64, and store the resulting <8 x i64> to %out.
; The expected output does two 128-bit loads (byte offsets 0 and 16) and four
; 128-bit stores (byte offsets 0, 16, 32 and 48); the upper dwords are
; materialized zeros. The GCNX3-HSA flat accesses take their non-zero offsets
; from explicit s_add_u32/s_addc_u32 pairs.
; The check lines are autogenerated (see the NOTE at the top of this file):
; regenerate them with utils/update_llc_test_checks.py rather than editing them.
1902define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1903; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
1904; SI-NOHSA:       ; %bb.0:
1905; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1906; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1907; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
1908; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
1909; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
1910; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1911; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
1912; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
1913; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
1914; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
1915; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
1916; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
1917; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
1918; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
1919; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
1920; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
1921; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
1922; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
1923; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
1924; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
1925; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
1926; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
1927; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
1928; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
1929; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
1930; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
1931; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
1932; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
1933; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
1934; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
1935; SI-NOHSA-NEXT:    s_endpgm
1936;
1937; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
1938; GCNX3-HSA:       ; %bb.0:
1939; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1940; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, 0
1941; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
1942; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1943; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
1944; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
1945; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
1946; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
1947; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1948; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
1949; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
1950; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1951; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
1952; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
1953; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
1954; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
1955; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
1956; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
1957; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
1958; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
1959; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s3
1960; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
1961; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s2
1962; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
1963; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
1964; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v2
1965; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v3
1966; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1967; GCNX3-HSA-NEXT:    s_nop 0
1968; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v0
1969; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v1
1970; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
1971; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
1972; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
1973; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v6
1974; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v7
1975; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
1976; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
1977; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v4
1978; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v5
1979; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
1980; GCNX3-HSA-NEXT:    s_endpgm
1981;
1982; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
1983; GCNX3-NOHSA:       ; %bb.0:
1984; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1985; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
1986; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
1987; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
1988; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
1989; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
1990; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
1991; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
1992; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
1993; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
1994; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, 0
1995; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v11, v9
1996; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
1997; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
1998; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
1999; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v2
2000; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v3
2001; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
2002; GCNX3-NOHSA-NEXT:    s_nop 0
2003; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v0
2004; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v1
2005; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
2006; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
2007; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
2008; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
2009; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
2010; GCNX3-NOHSA-NEXT:    s_nop 0
2011; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v4
2012; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v5
2013; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
2014; GCNX3-NOHSA-NEXT:    s_endpgm
2015;
2016; EG-LABEL: global_zextload_v8i32_to_v8i64:
2017; EG:       ; %bb.0:
2018; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
2019; EG-NEXT:    TEX 1 @8
2020; EG-NEXT:    ALU 26, @13, KC0[CB0:0-32], KC1[]
2021; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0
2022; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
2023; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0
2024; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 1
2025; EG-NEXT:    CF_END
2026; EG-NEXT:    Fetch clause starting at 8:
2027; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
2028; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2029; EG-NEXT:    ALU clause starting at 12:
2030; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2031; EG-NEXT:    ALU clause starting at 13:
2032; EG-NEXT:     MOV T2.X, T1.Z,
2033; EG-NEXT:     MOV T2.Y, 0.0,
2034; EG-NEXT:     MOV * T3.X, T1.X,
2035; EG-NEXT:     MOV * T3.Y, 0.0,
2036; EG-NEXT:     MOV T4.X, T0.Z,
2037; EG-NEXT:     MOV T4.Y, 0.0,
2038; EG-NEXT:     MOV * T5.X, T0.X,
2039; EG-NEXT:     MOV T5.Y, 0.0,
2040; EG-NEXT:     MOV T2.Z, T1.W,
2041; EG-NEXT:     MOV T2.W, 0.0,
2042; EG-NEXT:     MOV * T3.Z, T1.Y,
2043; EG-NEXT:     MOV * T3.W, 0.0,
2044; EG-NEXT:     MOV T4.Z, T0.W,
2045; EG-NEXT:     MOV T4.W, 0.0,
2046; EG-NEXT:     MOV * T5.Z, T0.Y,
2047; EG-NEXT:     MOV * T5.W, 0.0,
2048; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
2049; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2050; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2051; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
2052; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2053; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2054; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
2055; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2056; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
2057; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
2058; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2059;
2060; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64:
2061; GCN-HSA:       ; %bb.0:
2062; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2063; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
2064; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
2065; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2066; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3] offset:16
2067; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v1, s[2:3]
2068; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
2069; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
2070; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
2071; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
2072; GCN-HSA-NEXT:    s_nop 0
2073; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
2074; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
2075; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
2076; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
2077; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
2078; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
2079; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
2080; GCN-HSA-NEXT:    s_nop 0
2081; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v8
2082; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v9
2083; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
2084; GCN-HSA-NEXT:    s_endpgm
2085  %ld = load <8 x i32>, ptr addrspace(1) %in
2086  %ext = zext <8 x i32> %ld to <8 x i64>
2087  store <8 x i64> %ext, ptr addrspace(1) %out
2088  ret void
2089}
2090
; Sign-extending load of an 8-element vector: reads <8 x i32> from %in, widens
; every lane to i64 with sext, and writes the <8 x i64> result to %out.
; The expected output below splits the access into 128-bit pieces on every
; checked target (two loads, four stores) and builds each high dword with an
; arithmetic shift right by 31 (v_ashrrev_i32 / ASHR).
; These assertions are autogenerated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them instead of editing by hand.
2091define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2092; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
2093; SI-NOHSA:       ; %bb.0:
2094; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2095; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
2096; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
2097; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
2098; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
2099; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
2100; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
2101; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
2102; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2103; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
2104; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
2105; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2106; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
2107; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
2108; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
2109; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
2110; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
2111; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
2112; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
2113; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
2114; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
2115; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
2116; SI-NOHSA-NEXT:    v_mov_b32_e32 v19, v6
2117; SI-NOHSA-NEXT:    v_mov_b32_e32 v21, v7
2118; SI-NOHSA-NEXT:    v_mov_b32_e32 v15, v4
2119; SI-NOHSA-NEXT:    v_mov_b32_e32 v17, v5
2120; SI-NOHSA-NEXT:    v_mov_b32_e32 v11, v2
2121; SI-NOHSA-NEXT:    v_mov_b32_e32 v13, v3
2122; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v0
2123; SI-NOHSA-NEXT:    v_mov_b32_e32 v9, v1
2124; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:48
2125; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:32
2126; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16
2127; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[4:7], 0
2128; SI-NOHSA-NEXT:    s_endpgm
2129;
2130; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
2131; GCNX3-HSA:       ; %bb.0:
2132; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2133; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2134; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
2135; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
2136; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
2137; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
2138; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
2139; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2140; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
2141; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2142; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
2143; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2144; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, s3
2145; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, s2
2146; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
2147; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s1
2148; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2149; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s0
2150; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 32
2151; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
2152; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
2153; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
2154; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s1
2155; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s0
2156; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(1)
2157; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
2158; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v0
2159; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v3
2160; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v2
2161; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v2
2162; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v3
2163; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v0
2164; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v1
2165; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
2166; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
2167; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(2)
2168; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
2169; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
2170; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
2171; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v6
2172; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v7
2173; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
2174; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v4
2175; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v5
2176; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
2177; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
2178; GCNX3-HSA-NEXT:    s_endpgm
2179;
2180; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
2181; GCNX3-NOHSA:       ; %bb.0:
2182; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2183; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
2184; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
2185; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
2186; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
2187; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
2188; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
2189; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
2190; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2191; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2192; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
2193; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
2194; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
2195; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
2196; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
2197; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
2198; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
2199; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v6
2200; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v21, v7
2201; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
2202; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
2203; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
2204; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
2205; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
2206; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v15, v4
2207; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v17, v5
2208; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v11, v2
2209; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v13, v3
2210; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v7, v0
2211; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v9, v1
2212; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
2213; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32
2214; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
2215; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
2216; GCNX3-NOHSA-NEXT:    s_endpgm
2217;
2218; EG-LABEL: global_sextload_v8i32_to_v8i64:
2219; EG:       ; %bb.0:
2220; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
2221; EG-NEXT:    TEX 1 @8
2222; EG-NEXT:    ALU 31, @13, KC0[CB0:0-32], KC1[]
2223; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0
2224; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0
2225; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0
2226; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 1
2227; EG-NEXT:    CF_END
2228; EG-NEXT:    Fetch clause starting at 8:
2229; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
2230; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2231; EG-NEXT:    ALU clause starting at 12:
2232; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2233; EG-NEXT:    ALU clause starting at 13:
2234; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2235; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2236; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2237; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2238; EG-NEXT:     ADD_INT T2.W, KC0[2].Y, literal.y,
2239; EG-NEXT:     ASHR * T4.W, T0.Y, literal.z,
2240; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2241; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2242; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
2243; EG-NEXT:     ASHR T4.Y, T0.X, literal.y,
2244; EG-NEXT:     ASHR T6.W, T0.W, literal.y,
2245; EG-NEXT:     MOV * T4.X, T0.X,
2246; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
2247; EG-NEXT:     ASHR T6.Y, T0.Z, literal.x,
2248; EG-NEXT:     ASHR * T7.W, T1.Y, literal.x,
2249; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2250; EG-NEXT:     MOV T6.X, T0.Z,
2251; EG-NEXT:     ASHR T7.Y, T1.X, literal.x,
2252; EG-NEXT:     MOV T4.Z, T0.Y,
2253; EG-NEXT:     ASHR T8.W, T1.W, literal.x,
2254; EG-NEXT:     MOV * T7.X, T1.X,
2255; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2256; EG-NEXT:     ASHR T8.Y, T1.Z, literal.x,
2257; EG-NEXT:     MOV * T6.Z, T0.W,
2258; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2259; EG-NEXT:     MOV T8.X, T1.Z,
2260; EG-NEXT:     MOV T7.Z, T1.Y,
2261; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2262; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
2263; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
2264; EG-NEXT:     MOV * T8.Z, T1.W,
2265; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2266;
2267; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64:
2268; GCN-HSA:       ; %bb.0:
2269; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2270; GCN-HSA-NEXT:    v_mov_b32_e32 v23, 0
2271; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2272; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v23, s[2:3]
2273; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v23, s[2:3] offset:16
2274; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
2275; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
2276; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
2277; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v7
2278; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v6
2279; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v6
2280; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v7
2281; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
2282; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
2283; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
2284; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v5
2285; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v4
2286; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v4
2287; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v5
2288; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v2
2289; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v3
2290; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v0
2291; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v1
2292; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[19:22], s[0:1] offset:48
2293; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[15:18], s[0:1] offset:32
2294; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[11:14], s[0:1] offset:16
2295; GCN-HSA-NEXT:    global_store_dwordx4 v23, v[7:10], s[0:1]
2296; GCN-HSA-NEXT:    s_endpgm
2297  %ld = load <8 x i32>, ptr addrspace(1) %in
2298  %ext = sext <8 x i32> %ld to <8 x i64>
2299  store <8 x i64> %ext, ptr addrspace(1) %out
2300  ret void
2301}
2302
; Sign-extending load of a 16-element vector: reads <16 x i32> from %in, widens
; every lane to i64 with sext, and writes the <16 x i64> result to %out.
; The expected output below splits the access into 128-bit pieces on every
; checked target (four loads, eight stores) and builds each high dword with an
; arithmetic shift right by 31 (v_ashrrev_i32 / ASHR).
; These assertions are autogenerated by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them instead of editing by hand.
2303define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2304; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
2305; SI-NOHSA:       ; %bb.0:
2306; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
2307; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
2308; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
2309; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
2310; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
2311; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
2312; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
2313; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
2314; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2315; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
2316; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2317; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2318; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
2319; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
2320; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
2321; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v1
2322; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v0
2323; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v0
2324; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v1
2325; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
2326; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
2327; SI-NOHSA-NEXT:    s_waitcnt vmcnt(2)
2328; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
2329; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v6
2330; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
2331; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
2332; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v4
2333; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v5
2334; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
2335; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
2336; SI-NOHSA-NEXT:    s_waitcnt vmcnt(1)
2337; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
2338; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
2339; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
2340; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
2341; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
2342; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
2343; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
2344; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
2345; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
2346; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
2347; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
2348; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
2349; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
2350; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v12
2351; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v13
2352; SI-NOHSA-NEXT:    v_mov_b32_e32 v8, v14
2353; SI-NOHSA-NEXT:    v_mov_b32_e32 v10, v15
2354; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
2355; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
2356; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
2357; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
2358; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
2359; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
2360; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
2361; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
2362; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
2363; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2364; SI-NOHSA-NEXT:    s_endpgm
2365;
2366; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2367; GCNX3-HSA:       ; %bb.0:
2368; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2369; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2370; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
2371; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
2372; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
2373; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
2374; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
2375; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
2376; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
2377; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
2378; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
2379; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
2380; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
2381; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
2382; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
2383; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
2384; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
2385; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
2386; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
2387; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2388; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
2389; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2390; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
2391; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s2
2392; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
2393; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2394; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
2395; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
2396; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
2397; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2398; GCNX3-HSA-NEXT:    v_mov_b32_e32 v27, s3
2399; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
2400; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
2401; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
2402; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
2403; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2404; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
2405; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v9
2406; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v8
2407; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v8
2408; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v9
2409; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
2410; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
2411; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
2412; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
2413; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2414; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
2415; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
2416; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
2417; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v11
2418; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v10
2419; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v10
2420; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v11
2421; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2422; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
2423; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(4)
2424; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
2425; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
2426; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v13
2427; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v12
2428; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
2429; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
2430; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v14
2431; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v15
2432; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
2433; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
2434; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
2435; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
2436; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
2437; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(5)
2438; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
2439; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
2440; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
2441; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
2442; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v4
2443; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v5
2444; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v6
2445; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v7
2446; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s2
2447; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s1
2448; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
2449; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[8:11]
2450; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s0
2451; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(6)
2452; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
2453; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v0
2454; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, v0
2455; GCNX3-HSA-NEXT:    v_mov_b32_e32 v10, v1
2456; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
2457; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
2458; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
2459; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v3
2460; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
2461; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[4:7]
2462; GCNX3-HSA-NEXT:    s_endpgm
2463;
2464; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64:
2465; GCNX3-NOHSA:       ; %bb.0:
2466; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2467; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
2468; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
2469; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
2470; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
2471; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
2472; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
2473; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
2474; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
2475; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
2476; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2477; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2478; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
2479; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
2480; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
2481; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
2482; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
2483; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
2484; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
2485; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v24, v4
2486; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v26, v5
2487; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
2488; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
2489; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
2490; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v20, v6
2491; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v22, v7
2492; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
2493; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
2494; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
2495; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
2496; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
2497; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
2498; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
2499; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
2500; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
2501; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
2502; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
2503; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
2504; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
2505; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
2506; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
2507; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(0)
2508; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
2509; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
2510; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
2511; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
2512; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v32, v12
2513; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v34, v13
2514; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v14
2515; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v15
2516; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
2517; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
2518; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2519; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
2520; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
2521; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
2522; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0
2523; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
2524; GCNX3-NOHSA-NEXT:    s_endpgm
2525;
2526; EG-LABEL: global_sextload_v16i32_to_v16i64:
2527; EG:       ; %bb.0:
2528; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
2529; EG-NEXT:    TEX 3 @12
2530; EG-NEXT:    ALU 64, @21, KC0[CB0:0-32], KC1[]
2531; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0
2532; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0
2533; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0
2534; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 0
2535; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
2536; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T6.X, 0
2537; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T5.X, 0
2538; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T4.X, 1
2539; EG-NEXT:    CF_END
2540; EG-NEXT:    Fetch clause starting at 12:
2541; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
2542; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
2543; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
2544; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2545; EG-NEXT:    ALU clause starting at 20:
2546; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2547; EG-NEXT:    ALU clause starting at 21:
2548; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
2549; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2550; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2551; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2552; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2553; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.x,
2554; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
2555; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
2556; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
2557; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2558; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
2559; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
2560; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
2561; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
2562; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
2563; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
2564; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
2565; EG-NEXT:     ADD_INT T4.W, KC0[2].Y, literal.y,
2566; EG-NEXT:     ASHR * T10.W, T0.W, literal.z,
2567; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
2568; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2569; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
2570; EG-NEXT:     ASHR T10.Y, T0.Z, literal.y,
2571; EG-NEXT:     ASHR T12.W, T0.Y, literal.y,
2572; EG-NEXT:     MOV * T10.X, T0.Z,
2573; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
2574; EG-NEXT:     ASHR T12.Y, T0.X, literal.x,
2575; EG-NEXT:     ASHR * T13.W, T3.W, literal.x,
2576; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2577; EG-NEXT:     MOV T12.X, T0.X,
2578; EG-NEXT:     ASHR T13.Y, T3.Z, literal.x,
2579; EG-NEXT:     MOV T10.Z, T0.W,
2580; EG-NEXT:     ASHR T14.W, T3.Y, literal.x,
2581; EG-NEXT:     MOV * T13.X, T3.Z,
2582; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2583; EG-NEXT:     ASHR T14.Y, T3.X, literal.x,
2584; EG-NEXT:     MOV T12.Z, T0.Y,
2585; EG-NEXT:     ASHR * T0.W, T2.W, literal.x,
2586; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2587; EG-NEXT:     MOV T14.X, T3.X,
2588; EG-NEXT:     ASHR T0.Y, T2.Z, literal.x,
2589; EG-NEXT:     MOV T13.Z, T3.W,
2590; EG-NEXT:     ASHR T15.W, T2.Y, literal.x,
2591; EG-NEXT:     MOV * T0.X, T2.Z,
2592; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2593; EG-NEXT:     ASHR T15.Y, T2.X, literal.x,
2594; EG-NEXT:     MOV T14.Z, T3.Y,
2595; EG-NEXT:     ASHR * T3.W, T1.W, literal.x,
2596; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2597; EG-NEXT:     MOV T15.X, T2.X,
2598; EG-NEXT:     ASHR T3.Y, T1.Z, literal.x,
2599; EG-NEXT:     MOV T0.Z, T2.W,
2600; EG-NEXT:     ASHR T16.W, T1.Y, literal.x,
2601; EG-NEXT:     MOV * T3.X, T1.Z,
2602; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2603; EG-NEXT:     ASHR T16.Y, T1.X, literal.x,
2604; EG-NEXT:     MOV * T15.Z, T2.Y,
2605; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
2606; EG-NEXT:     MOV T16.X, T1.X,
2607; EG-NEXT:     MOV T3.Z, T1.W,
2608; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2609; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
2610; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
2611; EG-NEXT:     MOV * T16.Z, T1.Y,
2612; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2613;
2614; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64:
2615; GCN-HSA:       ; %bb.0:
2616; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2617; GCN-HSA-NEXT:    v_mov_b32_e32 v36, 0
2618; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2619; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v36, s[2:3] offset:32
2620; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v36, s[2:3] offset:48
2621; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v36, s[2:3] offset:16
2622; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v36, s[2:3]
2623; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
2624; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v3
2625; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
2626; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v5
2627; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v4
2628; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v4
2629; GCN-HSA-NEXT:    v_mov_b32_e32 v26, v5
2630; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v2
2631; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v7
2632; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v6
2633; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v6
2634; GCN-HSA-NEXT:    v_mov_b32_e32 v22, v7
2635; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
2636; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
2637; GCN-HSA-NEXT:    v_mov_b32_e32 v4, v0
2638; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
2639; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v2
2640; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v3
2641; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
2642; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v11
2643; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v10
2644; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
2645; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
2646; GCN-HSA-NEXT:    v_mov_b32_e32 v28, v8
2647; GCN-HSA-NEXT:    v_mov_b32_e32 v30, v9
2648; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
2649; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
2650; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
2651; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v15
2652; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v14
2653; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v13
2654; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v12
2655; GCN-HSA-NEXT:    v_mov_b32_e32 v32, v12
2656; GCN-HSA-NEXT:    v_mov_b32_e32 v34, v13
2657; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v14
2658; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v15
2659; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[24:27], s[0:1] offset:96
2660; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[20:23], s[0:1] offset:112
2661; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:64
2662; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:80
2663; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[28:31], s[0:1] offset:32
2664; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] offset:48
2665; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1]
2666; GCN-HSA-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:16
2667; GCN-HSA-NEXT:    s_endpgm
2668  %ld = load <16 x i32>, ptr addrspace(1) %in
2669  %ext = sext <16 x i32> %ld to <16 x i64>
2670  store <16 x i64> %ext, ptr addrspace(1) %out
2671  ret void
2672}
2673
2674define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2675; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
2676; SI-NOHSA:       ; %bb.0:
2677; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
2678; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
2679; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
2680; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
2681; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
2682; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
2683; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
2684; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
2685; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2686; SI-NOHSA-NEXT:    v_mov_b32_e32 v5, 0
2687; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
2688; SI-NOHSA-NEXT:    v_mov_b32_e32 v7, v5
2689; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
2690; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
2691; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2692; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
2693; SI-NOHSA-NEXT:    s_waitcnt vmcnt(3)
2694; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v0
2695; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v1
2696; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
2697; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
2698; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
2699; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
2700; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
2701; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
2702; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v8
2703; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v9
2704; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2705; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
2706; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v10
2707; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v11
2708; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
2709; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
2710; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v16
2711; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v17
2712; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
2713; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
2714; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v18
2715; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v19
2716; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
2717; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
2718; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v12
2719; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v13
2720; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
2721; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
2722; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v14
2723; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v15
2724; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2725; SI-NOHSA-NEXT:    s_endpgm
2726;
2727; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
2728; GCNX3-HSA:       ; %bb.0:
2729; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2730; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, 0
2731; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
2732; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2733; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
2734; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
2735; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
2736; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
2737; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
2738; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
2739; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 48
2740; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
2741; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2742; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
2743; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
2744; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2745; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
2746; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
2747; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
2748; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
2749; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
2750; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
2751; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
2752; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2753; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, s3
2754; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, s2
2755; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
2756; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2757; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
2758; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
2759; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
2760; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2761; GCNX3-HSA-NEXT:    v_mov_b32_e32 v27, s3
2762; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
2763; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
2764; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
2765; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
2766; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2767; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
2768; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v0
2769; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v1
2770; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
2771; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
2772; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
2773; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
2774; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v2
2775; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v3
2776; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2777; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s2
2778; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
2779; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s3
2780; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(4)
2781; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v4
2782; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v5
2783; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
2784; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
2785; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
2786; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v6
2787; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v7
2788; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[16:19]
2789; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s3
2790; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(5)
2791; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v8
2792; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v9
2793; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
2794; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
2795; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s2
2796; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v10
2797; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v11
2798; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
2799; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
2800; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
2801; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(6)
2802; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v12
2803; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v13
2804; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
2805; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
2806; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, v14
2807; GCNX3-HSA-NEXT:    v_mov_b32_e32 v18, v15
2808; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
2809; GCNX3-HSA-NEXT:    s_endpgm
2810;
2811; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64:
2812; GCNX3-NOHSA:       ; %bb.0:
2813; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2814; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
2815; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
2816; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
2817; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
2818; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
2819; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
2820; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
2821; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
2822; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
2823; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
2824; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
2825; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v17, 0
2826; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v17
2827; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
2828; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
2829; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
2830; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v0
2831; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v1
2832; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
2833; GCNX3-NOHSA-NEXT:    s_nop 0
2834; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v2
2835; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v3
2836; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
2837; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(4)
2838; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v4
2839; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v5
2840; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
2841; GCNX3-NOHSA-NEXT:    s_nop 0
2842; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v6
2843; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v7
2844; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
2845; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(5)
2846; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v8
2847; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v9
2848; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
2849; GCNX3-NOHSA-NEXT:    s_nop 0
2850; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v10
2851; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v11
2852; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
2853; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
2854; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v12
2855; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v13
2856; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
2857; GCNX3-NOHSA-NEXT:    s_nop 0
2858; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v14
2859; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v15
2860; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
2861; GCNX3-NOHSA-NEXT:    s_endpgm
2862;
2863; EG-LABEL: global_zextload_v16i32_to_v16i64:
2864; EG:       ; %bb.0:
2865; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
2866; EG-NEXT:    TEX 3 @12
2867; EG-NEXT:    ALU 55, @21, KC0[CB0:0-32], KC1[]
2868; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0
2869; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0
2870; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0
2871; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
2872; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T3.X, 0
2873; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T2.X, 0
2874; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T1.X, 0
2875; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T0.X, 1
2876; EG-NEXT:    CF_END
2877; EG-NEXT:    Fetch clause starting at 12:
2878; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
2879; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 0, #1
2880; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
2881; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 32, #1
2882; EG-NEXT:    ALU clause starting at 20:
2883; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2884; EG-NEXT:    ALU clause starting at 21:
2885; EG-NEXT:     MOV T4.X, T1.X,
2886; EG-NEXT:     MOV T4.Y, 0.0,
2887; EG-NEXT:     MOV * T5.X, T1.Z,
2888; EG-NEXT:     MOV * T5.Y, 0.0,
2889; EG-NEXT:     MOV T6.X, T0.X,
2890; EG-NEXT:     MOV T6.Y, 0.0,
2891; EG-NEXT:     MOV * T7.X, T0.Z,
2892; EG-NEXT:     MOV * T7.Y, 0.0,
2893; EG-NEXT:     MOV T8.X, T3.X,
2894; EG-NEXT:     MOV T8.Y, 0.0,
2895; EG-NEXT:     MOV * T9.X, T3.Z,
2896; EG-NEXT:     MOV * T9.Y, 0.0,
2897; EG-NEXT:     MOV T10.X, T2.X,
2898; EG-NEXT:     MOV T10.Y, 0.0,
2899; EG-NEXT:     MOV * T11.X, T2.Z,
2900; EG-NEXT:     MOV T11.Y, 0.0,
2901; EG-NEXT:     MOV T4.Z, T1.Y,
2902; EG-NEXT:     MOV T4.W, 0.0,
2903; EG-NEXT:     MOV * T5.Z, T1.W,
2904; EG-NEXT:     MOV * T5.W, 0.0,
2905; EG-NEXT:     MOV T6.Z, T0.Y,
2906; EG-NEXT:     MOV T6.W, 0.0,
2907; EG-NEXT:     MOV * T7.Z, T0.W,
2908; EG-NEXT:     MOV * T7.W, 0.0,
2909; EG-NEXT:     MOV T8.Z, T3.Y,
2910; EG-NEXT:     MOV T8.W, 0.0,
2911; EG-NEXT:     MOV * T9.Z, T3.W,
2912; EG-NEXT:     MOV * T9.W, 0.0,
2913; EG-NEXT:     MOV T10.Z, T2.Y,
2914; EG-NEXT:     MOV T10.W, 0.0,
2915; EG-NEXT:     MOV * T11.Z, T2.W,
2916; EG-NEXT:     MOV T11.W, 0.0,
2917; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2918; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2919; EG-NEXT:     LSHR T0.X, PS, literal.x,
2920; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2921; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2922; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2923; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
2924; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
2925; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2926; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2927; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2928; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2929; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
2930; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
2931; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2932; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
2933; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
2934; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2935; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
2936; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
2937; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2938; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
2939; EG-NEXT:     LSHR * T15.X, PV.W, literal.x,
2940; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2941;
2942; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64:
2943; GCN-HSA:       ; %bb.0:
2944; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2945; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
2946; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
2947; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2948; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3] offset:48
2949; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v1, s[2:3] offset:32
2950; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v1, s[2:3] offset:16
2951; GCN-HSA-NEXT:    global_load_dwordx4 v[16:19], v1, s[2:3]
2952; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
2953; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
2954; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
2955; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
2956; GCN-HSA-NEXT:    s_nop 0
2957; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
2958; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
2959; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
2960; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
2961; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v8
2962; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v9
2963; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
2964; GCN-HSA-NEXT:    s_nop 0
2965; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
2966; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
2967; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
2968; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
2969; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v12
2970; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v13
2971; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
2972; GCN-HSA-NEXT:    s_nop 0
2973; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v14
2974; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v15
2975; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
2976; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
2977; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v16
2978; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v17
2979; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
2980; GCN-HSA-NEXT:    s_nop 0
2981; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v18
2982; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v19
2983; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
2984; GCN-HSA-NEXT:    s_endpgm
2985  %ld = load <16 x i32>, ptr addrspace(1) %in
2986  %ext = zext <16 x i32> %ld to <16 x i64>
2987  store <16 x i64> %ext, ptr addrspace(1) %out
2988  ret void
2989}
2990
2991define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2992; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
2993; SI-NOHSA:       ; %bb.0:
2994; SI-NOHSA-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2995; SI-NOHSA-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2996; SI-NOHSA-NEXT:    s_mov_b32 s14, -1
2997; SI-NOHSA-NEXT:    s_mov_b32 s15, 0xe8f000
2998; SI-NOHSA-NEXT:    s_add_u32 s12, s12, s11
2999; SI-NOHSA-NEXT:    s_addc_u32 s13, s13, 0
3000; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
3001; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
3002; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
3003; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
3004; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
3005; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
3006; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
3007; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
3008; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96
3009; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
3010; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
3011; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
3012; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
3013; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3014; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
3015; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
3016; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
3017; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v31
3018; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v30
3019; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
3020; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
3021; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
3022; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v43, 31, v13
3023; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v41, 31, v12
3024; SI-NOHSA-NEXT:    v_mov_b32_e32 v40, v12
3025; SI-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
3026; SI-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
3027; SI-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
3028; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v29
3029; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
3030; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v28
3031; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v29
3032; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v30
3033; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v31
3034; SI-NOHSA-NEXT:    buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
3035; SI-NOHSA-NEXT:    buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
3036; SI-NOHSA-NEXT:    buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
3037; SI-NOHSA-NEXT:    buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
3038; SI-NOHSA-NEXT:    s_waitcnt vmcnt(9)
3039; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
3040; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
3041; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3042; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
3043; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
3044; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
3045; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v5
3046; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v6
3047; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v7
3048; SI-NOHSA-NEXT:    s_waitcnt vmcnt(8)
3049; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
3050; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
3051; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v1
3052; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v0
3053; SI-NOHSA-NEXT:    v_mov_b32_e32 v48, v0
3054; SI-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
3055; SI-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
3056; SI-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
3057; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
3058; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
3059; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
3060; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
3061; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
3062; SI-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
3063; SI-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
3064; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
3065; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
3066; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
3067; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
3068; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
3069; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
3070; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
3071; SI-NOHSA-NEXT:    v_mov_b32_e32 v56, v20
3072; SI-NOHSA-NEXT:    v_mov_b32_e32 v58, v21
3073; SI-NOHSA-NEXT:    v_mov_b32_e32 v16, v22
3074; SI-NOHSA-NEXT:    v_mov_b32_e32 v18, v23
3075; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
3076; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v27
3077; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v26
3078; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v63, 31, v25
3079; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v61, 31, v24
3080; SI-NOHSA-NEXT:    v_mov_b32_e32 v60, v24
3081; SI-NOHSA-NEXT:    v_mov_b32_e32 v62, v25
3082; SI-NOHSA-NEXT:    v_mov_b32_e32 v20, v26
3083; SI-NOHSA-NEXT:    v_mov_b32_e32 v22, v27
3084; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
3085; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v11
3086; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v10
3087; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v9
3088; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v8
3089; SI-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
3090; SI-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
3091; SI-NOHSA-NEXT:    v_mov_b32_e32 v24, v10
3092; SI-NOHSA-NEXT:    v_mov_b32_e32 v26, v11
3093; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
3094; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
3095; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
3096; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
3097; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
3098; SI-NOHSA-NEXT:    buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
3099; SI-NOHSA-NEXT:    buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
3100; SI-NOHSA-NEXT:    buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
3101; SI-NOHSA-NEXT:    buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
3102; SI-NOHSA-NEXT:    s_waitcnt vmcnt(0)
3103; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
3104; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
3105; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
3106; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
3107; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
3108; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
3109; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3110; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
3111; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
3112; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32
3113; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
3114; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
3115; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16
3116; SI-NOHSA-NEXT:    s_endpgm
3117;
3118; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3119; GCNX3-HSA:       ; %bb.0:
3120; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3121; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
3122; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
3123; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
3124; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
3125; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x70
3126; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
3127; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
3128; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
3129; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[0:1]
3130; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x60
3131; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
3132; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
3133; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
3134; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x50
3135; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
3136; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
3137; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
3138; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
3139; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 64
3140; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
3141; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
3142; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s4
3143; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s5
3144; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
3145; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 48
3146; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
3147; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
3148; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
3149; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[8:9]
3150; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
3151; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
3152; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 16
3153; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
3154; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
3155; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
3156; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
3157; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
3158; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3159; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
3160; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
3161; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3162; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s1
3163; GCNX3-HSA-NEXT:    v_mov_b32_e32 v36, s0
3164; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
3165; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v29
3166; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
3167; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, v28
3168; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, v29
3169; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
3170; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
3171; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
3172; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3173; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[32:35]
3174; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s3
3175; GCNX3-HSA-NEXT:    v_mov_b32_e32 v36, s2
3176; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
3177; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v31
3178; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v30
3179; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, v30
3180; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, v31
3181; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3182; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[32:35]
3183; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(8)
3184; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v25
3185; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s3
3186; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s2
3187; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
3188; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3189; GCNX3-HSA-NEXT:    v_mov_b32_e32 v35, s3
3190; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, s2
3191; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
3192; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v24
3193; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, v24
3194; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, v25
3195; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3196; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
3197; GCNX3-HSA-NEXT:    v_mov_b32_e32 v37, s3
3198; GCNX3-HSA-NEXT:    v_mov_b32_e32 v36, s2
3199; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
3200; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v27
3201; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v26
3202; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, v26
3203; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, v27
3204; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3205; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
3206; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s3
3207; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s2
3208; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
3209; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3210; GCNX3-HSA-NEXT:    v_mov_b32_e32 v39, s3
3211; GCNX3-HSA-NEXT:    v_mov_b32_e32 v38, s2
3212; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x80
3213; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
3214; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v21
3215; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v20
3216; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, v20
3217; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, v21
3218; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3219; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v23
3220; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v22
3221; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, v22
3222; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, v23
3223; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[24:27]
3224; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
3225; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(10)
3226; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v15
3227; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v14
3228; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v13
3229; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v12
3230; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, v12
3231; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, v13
3232; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, v14
3233; GCNX3-HSA-NEXT:    v_mov_b32_e32 v22, v15
3234; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
3235; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v5
3236; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v4
3237; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, v4
3238; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, v5
3239; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
3240; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
3241; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x90
3242; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3243; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
3244; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[20:23]
3245; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
3246; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
3247; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
3248; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
3249; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v7
3250; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v6
3251; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, v6
3252; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, v7
3253; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3254; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[23:26]
3255; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
3256; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v16
3257; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v16
3258; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s3
3259; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s2
3260; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
3261; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
3262; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v17
3263; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3264; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[4:7]
3265; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s3
3266; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s2
3267; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
3268; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v19
3269; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v24, 31, v18
3270; GCNX3-HSA-NEXT:    v_mov_b32_e32 v23, v18
3271; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, v19
3272; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3273; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[23:26]
3274; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
3275; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v9
3276; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v8
3277; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, v8
3278; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, v9
3279; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
3280; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
3281; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
3282; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3283; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[15:18]
3284; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
3285; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
3286; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
3287; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
3288; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
3289; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
3290; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, v11
3291; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v0
3292; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, v1
3293; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3294; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
3295; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
3296; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
3297; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[11:14]
3298; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
3299; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s0
3300; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v3
3301; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v2
3302; GCNX3-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v10
3303; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v10
3304; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v2
3305; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, v3
3306; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s1
3307; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
3308; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
3309; GCNX3-HSA-NEXT:    s_endpgm
3310;
3311; GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64:
3312; GCNX3-NOHSA:       ; %bb.0:
3313; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
3314; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
3315; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
3316; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
3317; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
3318; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
3319; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
3320; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
3321; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
3322; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112
3323; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
3324; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64
3325; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
3326; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3327; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
3328; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
3329; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
3330; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
3331; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
3332; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v35, 31, v11
3333; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v10
3334; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
3335; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
3336; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
3337; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v43, 31, v13
3338; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v41, 31, v12
3339; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v40, v12
3340; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v42, v13
3341; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v36, v14
3342; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v38, v15
3343; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v9
3344; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v8
3345; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v8
3346; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v9
3347; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v32, v10
3348; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v34, v11
3349; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(5)
3350; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
3351; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v6
3352; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
3353; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
3354; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
3355; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v46, v5
3356; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v8, v6
3357; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v10, v7
3358; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(4)
3359; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
3360; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
3361; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v4, v2
3362; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v6, v3
3363; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(3)
3364; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
3365; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
3366; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
3367; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v23
3368; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v51, 31, v1
3369; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v49, 31, v0
3370; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v48, v0
3371; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v50, v1
3372; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v18
3373; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v55, 31, v17
3374; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v53, 31, v16
3375; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v52, v16
3376; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v54, v17
3377; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
3378; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v22
3379; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v59, 31, v21
3380; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v57, 31, v20
3381; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v56, v20
3382; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v58, v21
3383; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v16, v22
3384; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v18, v23
3385; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(1)
3386; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v22, 31, v27
3387; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v26
3388; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
3389; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
3390; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v25
3391; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v24
3392; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(2)
3393; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v31
3394; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v30
3395; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192
3396; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v37, v31
3397; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v29
3398; GCNX3-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v28
3399; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v12, v28
3400; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v14, v29
3401; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
3402; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
3403; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
3404; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
3405; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
3406; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96
3407; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3408; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64
3409; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
3410; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
3411; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v35, v30
3412; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v39, v24
3413; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v41, v25
3414; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v19, v26
3415; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v21, v27
3416; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48
3417; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0
3418; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:16
3419; GCNX3-NOHSA-NEXT:    s_endpgm
3420;
3421; EG-LABEL: global_sextload_v32i32_to_v32i64:
3422; EG:       ; %bb.0:
3423; EG-NEXT:    ALU 33, @36, KC0[CB0:0-32], KC1[]
3424; EG-NEXT:    TEX 7 @20
3425; EG-NEXT:    ALU 96, @70, KC0[CB0:0-32], KC1[]
3426; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0
3427; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0
3428; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
3429; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0
3430; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0
3431; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0
3432; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0
3433; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0
3434; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0
3435; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0
3436; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0
3437; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
3438; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0
3439; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0
3440; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0
3441; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1
3442; EG-NEXT:    CF_END
3443; EG-NEXT:    Fetch clause starting at 20:
3444; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 112, #1
3445; EG-NEXT:     VTX_READ_128 T13.XYZW, T11.X, 96, #1
3446; EG-NEXT:     VTX_READ_128 T14.XYZW, T11.X, 80, #1
3447; EG-NEXT:     VTX_READ_128 T15.XYZW, T11.X, 64, #1
3448; EG-NEXT:     VTX_READ_128 T16.XYZW, T11.X, 48, #1
3449; EG-NEXT:     VTX_READ_128 T17.XYZW, T11.X, 32, #1
3450; EG-NEXT:     VTX_READ_128 T18.XYZW, T11.X, 16, #1
3451; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
3452; EG-NEXT:    ALU clause starting at 36:
3453; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3454; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3455; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
3456; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
3457; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3458; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3459; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
3460; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
3461; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3462; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
3463; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
3464; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3465; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
3466; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
3467; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3468; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
3469; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
3470; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3471; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
3472; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
3473; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3474; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
3475; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
3476; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3477; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
3478; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
3479; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3480; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
3481; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
3482; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3483; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
3484; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
3485; EG-NEXT:     MOV * T11.X, KC0[2].Z,
3486; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3487; EG-NEXT:    ALU clause starting at 70:
3488; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3489; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
3490; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
3491; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3492; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
3493; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
3494; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3495; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
3496; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
3497; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
3498; EG-NEXT:     ASHR * T22.W, T11.W, literal.z,
3499; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
3500; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3501; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
3502; EG-NEXT:     ASHR T22.Y, T11.Z, literal.y,
3503; EG-NEXT:     ASHR T24.W, T11.Y, literal.y,
3504; EG-NEXT:     MOV * T22.X, T11.Z,
3505; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
3506; EG-NEXT:     ASHR T24.Y, T11.X, literal.x,
3507; EG-NEXT:     ASHR * T25.W, T18.W, literal.x,
3508; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3509; EG-NEXT:     MOV T24.X, T11.X,
3510; EG-NEXT:     ASHR T25.Y, T18.Z, literal.x,
3511; EG-NEXT:     MOV T22.Z, T11.W,
3512; EG-NEXT:     ASHR T26.W, T18.Y, literal.x,
3513; EG-NEXT:     MOV * T25.X, T18.Z,
3514; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3515; EG-NEXT:     ASHR T26.Y, T18.X, literal.x,
3516; EG-NEXT:     MOV T24.Z, T11.Y,
3517; EG-NEXT:     ASHR * T11.W, T17.W, literal.x,
3518; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3519; EG-NEXT:     MOV T26.X, T18.X,
3520; EG-NEXT:     ASHR T11.Y, T17.Z, literal.x,
3521; EG-NEXT:     MOV T25.Z, T18.W,
3522; EG-NEXT:     ASHR T27.W, T17.Y, literal.x,
3523; EG-NEXT:     MOV * T11.X, T17.Z,
3524; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3525; EG-NEXT:     ASHR T27.Y, T17.X, literal.x,
3526; EG-NEXT:     MOV T26.Z, T18.Y,
3527; EG-NEXT:     ASHR * T18.W, T16.W, literal.x,
3528; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3529; EG-NEXT:     MOV T27.X, T17.X,
3530; EG-NEXT:     ASHR T18.Y, T16.Z, literal.x,
3531; EG-NEXT:     MOV T11.Z, T17.W,
3532; EG-NEXT:     ASHR T28.W, T16.Y, literal.x,
3533; EG-NEXT:     MOV * T18.X, T16.Z,
3534; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3535; EG-NEXT:     ASHR T28.Y, T16.X, literal.x,
3536; EG-NEXT:     MOV T27.Z, T17.Y,
3537; EG-NEXT:     ASHR * T17.W, T15.W, literal.x,
3538; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3539; EG-NEXT:     MOV T28.X, T16.X,
3540; EG-NEXT:     ASHR T17.Y, T15.Z, literal.x,
3541; EG-NEXT:     MOV T18.Z, T16.W,
3542; EG-NEXT:     ASHR T29.W, T15.Y, literal.x,
3543; EG-NEXT:     MOV * T17.X, T15.Z,
3544; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3545; EG-NEXT:     ASHR T29.Y, T15.X, literal.x,
3546; EG-NEXT:     MOV T28.Z, T16.Y,
3547; EG-NEXT:     ASHR * T16.W, T14.W, literal.x,
3548; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3549; EG-NEXT:     MOV T29.X, T15.X,
3550; EG-NEXT:     ASHR T16.Y, T14.Z, literal.x,
3551; EG-NEXT:     MOV T17.Z, T15.W,
3552; EG-NEXT:     ASHR T30.W, T14.Y, literal.x,
3553; EG-NEXT:     MOV * T16.X, T14.Z,
3554; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3555; EG-NEXT:     ASHR T30.Y, T14.X, literal.x,
3556; EG-NEXT:     MOV T29.Z, T15.Y,
3557; EG-NEXT:     ASHR * T15.W, T13.W, literal.x,
3558; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3559; EG-NEXT:     MOV T30.X, T14.X,
3560; EG-NEXT:     ASHR T15.Y, T13.Z, literal.x,
3561; EG-NEXT:     MOV T16.Z, T14.W,
3562; EG-NEXT:     ASHR T31.W, T13.Y, literal.x,
3563; EG-NEXT:     MOV * T15.X, T13.Z,
3564; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3565; EG-NEXT:     ASHR T31.Y, T13.X, literal.x,
3566; EG-NEXT:     MOV T30.Z, T14.Y,
3567; EG-NEXT:     ASHR * T14.W, T12.W, literal.x,
3568; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3569; EG-NEXT:     MOV T31.X, T13.X,
3570; EG-NEXT:     ASHR T14.Y, T12.Z, literal.x,
3571; EG-NEXT:     MOV T15.Z, T13.W,
3572; EG-NEXT:     ASHR T32.W, T12.Y, literal.x,
3573; EG-NEXT:     MOV * T14.X, T12.Z,
3574; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3575; EG-NEXT:     ASHR T32.Y, T12.X, literal.x,
3576; EG-NEXT:     MOV * T31.Z, T13.Y,
3577; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
3578; EG-NEXT:     MOV T32.X, T12.X,
3579; EG-NEXT:     MOV T14.Z, T12.W,
3580; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3581; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
3582; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
3583; EG-NEXT:     MOV * T32.Z, T12.Y,
3584; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3585;
3586; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3587; GCN-GFX900-HSA:       ; %bb.0:
3588; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[18:19], s[2:3]
3589; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[16:17], s[0:1]
3590; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3591; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, 0
3592; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s15
3593; GCN-GFX900-HSA-NEXT:    s_addc_u32 s17, s17, 0
3594; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
3595; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
3596; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
3597; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
3598; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
3599; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
3600; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
3601; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(5)
3602; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
3603; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v2
3604; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v25, v2
3605; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v27, v3
3606; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(4)
3607; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v32, 31, v7
3608; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v30, 31, v6
3609; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v5
3610; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v34, 31, v4
3611; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v33, v4
3612; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v35, v5
3613; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v29, v6
3614; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v31, v7
3615; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
3616; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
3617; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v4, v0
3618; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v6, v1
3619; GCN-GFX900-HSA-NEXT:    buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
3620; GCN-GFX900-HSA-NEXT:    s_nop 0
3621; GCN-GFX900-HSA-NEXT:    buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
3622; GCN-GFX900-HSA-NEXT:    buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
3623; GCN-GFX900-HSA-NEXT:    buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
3624; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(7)
3625; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v12
3626; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v11
3627; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v10
3628; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v9
3629; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v37, v9
3630; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v39, v10
3631; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v25, v11
3632; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v27, v12
3633; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(6)
3634; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
3635; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v15
3636; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v44, 31, v14
3637; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v13
3638; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v41, v13
3639; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v43, v14
3640; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v9, v15
3641; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v11, v16
3642; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(5)
3643; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v20
3644; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v19
3645; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v48, 31, v18
3646; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v46, 31, v17
3647; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v45, v17
3648; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v47, v18
3649; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v13, v19
3650; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
3651; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v15, v20
3652; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(5)
3653; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v24
3654; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v23
3655; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v56, 31, v22
3656; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v54, 31, v21
3657; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v53, v21
3658; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v55, v22
3659; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v17, v23
3660; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v19, v24
3661; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3]
3662; GCN-GFX900-HSA-NEXT:    s_nop 0
3663; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
3664; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
3665; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
3666; GCN-GFX900-HSA-NEXT:    buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
3667; GCN-GFX900-HSA-NEXT:    s_nop 0
3668; GCN-GFX900-HSA-NEXT:    buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
3669; GCN-GFX900-HSA-NEXT:    buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
3670; GCN-GFX900-HSA-NEXT:    buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
3671; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(8)
3672; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v60, 31, v52
3673; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v58, 31, v51
3674; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v50
3675; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v49
3676; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v0, v49
3677; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v2, v50
3678; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v57, v51
3679; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v59, v52
3680; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(7)
3681; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v24
3682; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v23
3683; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v22
3684; GCN-GFX900-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v21
3685; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v4, v21
3686; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v6, v22
3687; GCN-GFX900-HSA-NEXT:    s_waitcnt vmcnt(0)
3688; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
3689; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
3690; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
3691; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
3692; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
3693; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
3694; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
3695; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
3696; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
3697; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
3698; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
3699; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
3700; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v28, v23
3701; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v30, v24
3702; GCN-GFX900-HSA-NEXT:    global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
3703; GCN-GFX900-HSA-NEXT:    s_endpgm
3704;
3705; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64:
3706; GCN-GFX908-HSA:       ; %bb.0:
3707; GCN-GFX908-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3708; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v8, 0
3709; GCN-GFX908-HSA-NEXT:    s_waitcnt lgkmcnt(0)
3710; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
3711; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
3712; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[9:12], v8, s[2:3] offset:80
3713; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[13:16], v8, s[2:3] offset:64
3714; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[17:20], v8, s[2:3] offset:48
3715; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3] offset:32
3716; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[49:52], v8, s[2:3] offset:16
3717; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(6)
3718; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v25, v2
3719; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
3720; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v2
3721; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v27, v3
3722; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a0, v25
3723; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a1, v26
3724; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a2, v27
3725; GCN-GFX908-HSA-NEXT:    v_accvgpr_write_b32 a3, v28
3726; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(4)
3727; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v28, 31, v12
3728; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v26, 31, v11
3729; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v40, 31, v10
3730; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v38, 31, v9
3731; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v37, v9
3732; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v39, v10
3733; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v25, v11
3734; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v27, v12
3735; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
3736; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v12, 31, v16
3737; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v15
3738; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v44, 31, v14
3739; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v42, 31, v13
3740; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v41, v13
3741; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v43, v14
3742; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v9, v15
3743; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v11, v16
3744; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(2)
3745; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v16, 31, v20
3746; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v14, 31, v19
3747; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v48, 31, v18
3748; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v46, 31, v17
3749; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v45, v17
3750; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v47, v18
3751; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v13, v19
3752; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v15, v20
3753; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(1)
3754; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v20, 31, v24
3755; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v18, 31, v23
3756; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v56, 31, v22
3757; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v54, 31, v21
3758; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v53, v21
3759; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v55, v22
3760; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v17, v23
3761; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v19, v24
3762; GCN-GFX908-HSA-NEXT:    global_load_dwordx4 v[21:24], v8, s[2:3]
3763; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v32, 31, v7
3764; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v36, 31, v5
3765; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v34, 31, v4
3766; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v33, v4
3767; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v35, v5
3768; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v30, 31, v6
3769; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v29, v6
3770; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v31, v7
3771; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
3772; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
3773; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v35, a3
3774; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
3775; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
3776; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v4, v0
3777; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v6, v1
3778; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v34, a2
3779; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v33, a1
3780; GCN-GFX908-HSA-NEXT:    v_accvgpr_read_b32 v32, a0
3781; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
3782; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v60, 31, v52
3783; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v58, 31, v51
3784; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v50
3785; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v49
3786; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v0, v49
3787; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v2, v50
3788; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v57, v51
3789; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v59, v52
3790; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
3791; GCN-GFX908-HSA-NEXT:    s_waitcnt vmcnt(3)
3792; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v31, 31, v24
3793; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v29, 31, v23
3794; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v22
3795; GCN-GFX908-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v21
3796; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v4, v21
3797; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v6, v22
3798; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[32:35], s[0:1] offset:208
3799; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[37:40], s[0:1] offset:160
3800; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[25:28], s[0:1] offset:176
3801; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[41:44], s[0:1] offset:128
3802; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[9:12], s[0:1] offset:144
3803; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[45:48], s[0:1] offset:96
3804; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[13:16], s[0:1] offset:112
3805; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[53:56], s[0:1] offset:64
3806; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[17:20], s[0:1] offset:80
3807; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:32
3808; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[57:60], s[0:1] offset:48
3809; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
3810; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v28, v23
3811; GCN-GFX908-HSA-NEXT:    v_mov_b32_e32 v30, v24
3812; GCN-GFX908-HSA-NEXT:    global_store_dwordx4 v8, v[28:31], s[0:1] offset:16
3813; GCN-GFX908-HSA-NEXT:    s_endpgm
3814  %ld = load <32 x i32>, ptr addrspace(1) %in
3815  %ext = sext <32 x i32> %ld to <32 x i64>
3816  store <32 x i64> %ext, ptr addrspace(1) %out
3817  ret void
3818}
3819
3820define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3821; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
3822; SI-NOHSA:       ; %bb.0:
3823; SI-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
3824; SI-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
3825; SI-NOHSA-NEXT:    s_mov_b32 s2, -1
3826; SI-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
3827; SI-NOHSA-NEXT:    s_mov_b32 s10, s2
3828; SI-NOHSA-NEXT:    s_mov_b32 s11, s3
3829; SI-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
3830; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
3831; SI-NOHSA-NEXT:    s_mov_b32 s8, s6
3832; SI-NOHSA-NEXT:    s_mov_b32 s9, s7
3833; SI-NOHSA-NEXT:    s_mov_b32 s0, s4
3834; SI-NOHSA-NEXT:    s_mov_b32 s1, s5
3835; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112
3836; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96
3837; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
3838; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
3839; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
3840; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
3841; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
3842; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v4
3843; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
3844; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64
3845; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80
3846; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
3847; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3848; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v6
3849; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
3850; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
3851; SI-NOHSA-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
3852; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v8
3853; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
3854; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
3855; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3856; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v10
3857; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v11
3858; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
3859; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
3860; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v32
3861; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v33
3862; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
3863; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3864; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v34
3865; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v35
3866; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
3867; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3868; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v28
3869; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v29
3870; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
3871; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3872; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v30
3873; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v31
3874; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
3875; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3876; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v24
3877; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v25
3878; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
3879; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3880; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v26
3881; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v27
3882; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
3883; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3884; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v20
3885; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v21
3886; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
3887; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3888; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v22
3889; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v23
3890; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
3891; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3892; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v16
3893; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v17
3894; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
3895; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3896; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v18
3897; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v19
3898; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
3899; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3900; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v12
3901; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v13
3902; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3903; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
3904; SI-NOHSA-NEXT:    v_mov_b32_e32 v0, v14
3905; SI-NOHSA-NEXT:    v_mov_b32_e32 v2, v15
3906; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3907; SI-NOHSA-NEXT:    s_endpgm
3908;
3909; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
3910; GCNX3-HSA:       ; %bb.0:
3911; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3912; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
3913; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
3914; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
3915; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 32
3916; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
3917; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
3918; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
3919; GCNX3-HSA-NEXT:    s_add_u32 s8, s2, 48
3920; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
3921; GCNX3-HSA-NEXT:    s_addc_u32 s9, s3, 0
3922; GCNX3-HSA-NEXT:    s_add_u32 s10, s2, 64
3923; GCNX3-HSA-NEXT:    s_addc_u32 s11, s3, 0
3924; GCNX3-HSA-NEXT:    s_add_u32 s12, s2, 0x50
3925; GCNX3-HSA-NEXT:    s_addc_u32 s13, s3, 0
3926; GCNX3-HSA-NEXT:    s_add_u32 s14, s2, 0x60
3927; GCNX3-HSA-NEXT:    s_addc_u32 s15, s3, 0
3928; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 0x70
3929; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
3930; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
3931; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
3932; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
3933; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s14
3934; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s15
3935; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[0:1]
3936; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s12
3937; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s13
3938; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
3939; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s10
3940; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s11
3941; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
3942; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s8
3943; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s9
3944; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
3945; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s4
3946; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s7
3947; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s5
3948; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s6
3949; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
3950; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
3951; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, 0
3952; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
3953; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, v1
3954; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3955; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
3956; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v28
3957; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v29
3958; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s1
3959; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s0
3960; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
3961; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
3962; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
3963; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
3964; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3965; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v30
3966; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v31
3967; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s3
3968; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s2
3969; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
3970; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3971; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
3972; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
3973; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
3974; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
3975; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3976; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(8)
3977; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v32
3978; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v33
3979; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
3980; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s3
3981; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s2
3982; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
3983; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3984; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s3
3985; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s2
3986; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
3987; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3988; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v34
3989; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v35
3990; GCNX3-HSA-NEXT:    v_mov_b32_e32 v35, s3
3991; GCNX3-HSA-NEXT:    v_mov_b32_e32 v34, s2
3992; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
3993; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3994; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
3995; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s3
3996; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s2
3997; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x80
3998; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
3999; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(9)
4000; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v24
4001; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v25
4002; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
4003; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
4004; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x90
4005; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4006; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
4007; GCNX3-HSA-NEXT:    s_nop 0
4008; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v26
4009; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v27
4010; GCNX3-HSA-NEXT:    v_mov_b32_e32 v27, s3
4011; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
4012; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
4013; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4014; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
4015; GCNX3-HSA-NEXT:    v_mov_b32_e32 v31, s3
4016; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(10)
4017; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v20
4018; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v21
4019; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[0:3]
4020; GCNX3-HSA-NEXT:    v_mov_b32_e32 v30, s2
4021; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v22
4022; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v23
4023; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x70
4024; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[0:3]
4025; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4026; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(11)
4027; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v16
4028; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v17
4029; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
4030; GCNX3-HSA-NEXT:    s_nop 0
4031; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v18
4032; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v19
4033; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
4034; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
4035; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v12
4036; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v13
4037; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s3
4038; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s2
4039; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
4040; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[0:3]
4041; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4042; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v14
4043; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v15
4044; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
4045; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(12)
4046; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v8
4047; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v9
4048; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
4049; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
4050; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x50
4051; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4052; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
4053; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
4054; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
4055; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 32
4056; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v10
4057; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v11
4058; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4059; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
4060; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 48
4061; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v4
4062; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v5
4063; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
4064; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
4065; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
4066; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4067; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s1
4068; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, v6
4069; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, v7
4070; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s0
4071; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4072; GCNX3-HSA-NEXT:    s_endpgm
4073;
4074; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64:
4075; GCNX3-NOHSA:       ; %bb.0:
4076; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
4077; GCNX3-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
4078; GCNX3-NOHSA-NEXT:    s_mov_b32 s2, -1
4079; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s2
4080; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s3
4081; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
4082; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s6
4083; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s7
4084; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
4085; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
4086; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
4087; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
4088; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48
4089; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
4090; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
4091; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[32:35], off, s[8:11], 0
4092; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v29, 0
4093; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v31, v29
4094; GCNX3-NOHSA-NEXT:    s_mov_b32 s0, s4
4095; GCNX3-NOHSA-NEXT:    s_mov_b32 s1, s5
4096; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
4097; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v0
4098; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v1
4099; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224
4100; GCNX3-NOHSA-NEXT:    s_nop 0
4101; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v2
4102; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v3
4103; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240
4104; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(8)
4105; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v4
4106; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v5
4107; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192
4108; GCNX3-NOHSA-NEXT:    s_nop 0
4109; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v6
4110; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v7
4111; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208
4112; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(9)
4113; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v8
4114; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v9
4115; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160
4116; GCNX3-NOHSA-NEXT:    s_nop 0
4117; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v10
4118; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v11
4119; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
4120; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(10)
4121; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v12
4122; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v13
4123; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:128
4124; GCNX3-NOHSA-NEXT:    s_nop 0
4125; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v14
4126; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v15
4127; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
4128; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(11)
4129; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v16
4130; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v17
4131; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
4132; GCNX3-NOHSA-NEXT:    s_nop 0
4133; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v18
4134; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v19
4135; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
4136; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(12)
4137; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v20
4138; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v21
4139; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
4140; GCNX3-NOHSA-NEXT:    s_nop 0
4141; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v22
4142; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v23
4143; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
4144; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(13)
4145; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v24
4146; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v25
4147; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32
4148; GCNX3-NOHSA-NEXT:    s_nop 0
4149; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v26
4150; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v27
4151; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
4152; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(14)
4153; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v32
4154; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v33
4155; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0
4156; GCNX3-NOHSA-NEXT:    s_nop 0
4157; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v28, v34
4158; GCNX3-NOHSA-NEXT:    v_mov_b32_e32 v30, v35
4159; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
4160; GCNX3-NOHSA-NEXT:    s_endpgm
4161;
4162; EG-LABEL: global_zextload_v32i32_to_v32i64:
4163; EG:       ; %bb.0:
4164; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
4165; EG-NEXT:    TEX 2 @22
4166; EG-NEXT:    ALU 10, @39, KC0[], KC1[]
4167; EG-NEXT:    TEX 4 @28
4168; EG-NEXT:    ALU 100, @50, KC0[CB0:0-32], KC1[]
4169; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0
4170; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0
4171; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0
4172; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0
4173; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0
4174; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0
4175; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
4176; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
4177; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0
4178; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0
4179; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0
4180; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0
4181; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0
4182; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0
4183; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0
4184; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1
4185; EG-NEXT:    CF_END
4186; EG-NEXT:    Fetch clause starting at 22:
4187; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 112, #1
4188; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 80, #1
4189; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 96, #1
4190; EG-NEXT:    Fetch clause starting at 28:
4191; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
4192; EG-NEXT:     VTX_READ_128 T11.XYZW, T0.X, 16, #1
4193; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 32, #1
4194; EG-NEXT:     VTX_READ_128 T13.XYZW, T0.X, 48, #1
4195; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 64, #1
4196; EG-NEXT:    ALU clause starting at 38:
4197; EG-NEXT:     MOV * T0.X, KC0[2].Z,
4198; EG-NEXT:    ALU clause starting at 39:
4199; EG-NEXT:     MOV T4.X, T1.X,
4200; EG-NEXT:     MOV T4.Y, 0.0,
4201; EG-NEXT:     MOV * T5.X, T1.Z,
4202; EG-NEXT:     MOV * T5.Y, 0.0,
4203; EG-NEXT:     MOV T6.X, T3.X,
4204; EG-NEXT:     MOV T6.Y, 0.0,
4205; EG-NEXT:     MOV * T7.X, T3.Z,
4206; EG-NEXT:     MOV * T7.Y, 0.0,
4207; EG-NEXT:     MOV T8.X, T2.X,
4208; EG-NEXT:     MOV T8.Y, 0.0,
4209; EG-NEXT:     MOV * T9.X, T2.Z,
4210; EG-NEXT:    ALU clause starting at 50:
4211; EG-NEXT:     MOV * T9.Y, 0.0,
4212; EG-NEXT:     MOV T14.X, T0.X,
4213; EG-NEXT:     MOV T14.Y, 0.0,
4214; EG-NEXT:     MOV * T15.X, T0.Z,
4215; EG-NEXT:     MOV * T15.Y, 0.0,
4216; EG-NEXT:     MOV T16.X, T13.X,
4217; EG-NEXT:     MOV T16.Y, 0.0,
4218; EG-NEXT:     MOV * T17.X, T13.Z,
4219; EG-NEXT:     MOV * T17.Y, 0.0,
4220; EG-NEXT:     MOV T18.X, T12.X,
4221; EG-NEXT:     MOV T18.Y, 0.0,
4222; EG-NEXT:     MOV * T19.X, T12.Z,
4223; EG-NEXT:     MOV * T19.Y, 0.0,
4224; EG-NEXT:     MOV T20.X, T11.X,
4225; EG-NEXT:     MOV T20.Y, 0.0,
4226; EG-NEXT:     MOV * T21.X, T11.Z,
4227; EG-NEXT:     MOV * T21.Y, 0.0,
4228; EG-NEXT:     MOV T22.X, T10.X,
4229; EG-NEXT:     MOV T22.Y, 0.0,
4230; EG-NEXT:     MOV * T23.X, T10.Z,
4231; EG-NEXT:     MOV T23.Y, 0.0,
4232; EG-NEXT:     MOV T4.Z, T1.Y,
4233; EG-NEXT:     MOV T4.W, 0.0,
4234; EG-NEXT:     MOV * T5.Z, T1.W,
4235; EG-NEXT:     MOV * T5.W, 0.0,
4236; EG-NEXT:     MOV T6.Z, T3.Y,
4237; EG-NEXT:     MOV T6.W, 0.0,
4238; EG-NEXT:     MOV * T7.Z, T3.W,
4239; EG-NEXT:     MOV * T7.W, 0.0,
4240; EG-NEXT:     MOV T8.Z, T2.Y,
4241; EG-NEXT:     MOV T8.W, 0.0,
4242; EG-NEXT:     MOV * T9.Z, T2.W,
4243; EG-NEXT:     MOV * T9.W, 0.0,
4244; EG-NEXT:     MOV T14.Z, T0.Y,
4245; EG-NEXT:     MOV T14.W, 0.0,
4246; EG-NEXT:     MOV * T15.Z, T0.W,
4247; EG-NEXT:     MOV * T15.W, 0.0,
4248; EG-NEXT:     MOV T16.Z, T13.Y,
4249; EG-NEXT:     MOV T16.W, 0.0,
4250; EG-NEXT:     MOV * T17.Z, T13.W,
4251; EG-NEXT:     MOV * T17.W, 0.0,
4252; EG-NEXT:     MOV T18.Z, T12.Y,
4253; EG-NEXT:     MOV T18.W, 0.0,
4254; EG-NEXT:     MOV * T19.Z, T12.W,
4255; EG-NEXT:     MOV * T19.W, 0.0,
4256; EG-NEXT:     MOV T20.Z, T11.Y,
4257; EG-NEXT:     MOV T20.W, 0.0,
4258; EG-NEXT:     MOV * T21.Z, T11.W,
4259; EG-NEXT:     MOV * T21.W, 0.0,
4260; EG-NEXT:     MOV T22.Z, T10.Y,
4261; EG-NEXT:     MOV T22.W, 0.0,
4262; EG-NEXT:     MOV * T23.Z, T10.W,
4263; EG-NEXT:     MOV T23.W, 0.0,
4264; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4265; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4266; EG-NEXT:     LSHR T0.X, PS, literal.x,
4267; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4268; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4269; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4270; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4271; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
4272; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4273; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
4274; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
4275; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4276; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
4277; EG-NEXT:     LSHR T10.X, PV.W, literal.x,
4278; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4279; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
4280; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
4281; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4282; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
4283; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
4284; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4285; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
4286; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
4287; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4288; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
4289; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
4290; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4291; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
4292; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
4293; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4294; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
4295; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
4296; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4297; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
4298; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
4299; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4300; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
4301; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
4302; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4303; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
4304; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
4305; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4306; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
4307; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
4308; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4309; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
4310; EG-NEXT:     LSHR * T31.X, PV.W, literal.x,
4311; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4312;
4313; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64:
4314; GCN-HSA:       ; %bb.0:
4315; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4316; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
4317; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
4318; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
4319; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v1, s[2:3] offset:112
4320; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v1, s[2:3] offset:96
4321; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v1, s[2:3] offset:80
4322; GCN-HSA-NEXT:    global_load_dwordx4 v[16:19], v1, s[2:3] offset:64
4323; GCN-HSA-NEXT:    global_load_dwordx4 v[20:23], v1, s[2:3] offset:48
4324; GCN-HSA-NEXT:    global_load_dwordx4 v[24:27], v1, s[2:3] offset:32
4325; GCN-HSA-NEXT:    global_load_dwordx4 v[28:31], v1, s[2:3] offset:16
4326; GCN-HSA-NEXT:    global_load_dwordx4 v[32:35], v1, s[2:3]
4327; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4328; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v4
4329; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v5
4330; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:224
4331; GCN-HSA-NEXT:    s_nop 0
4332; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v6
4333; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v7
4334; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:240
4335; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
4336; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v8
4337; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v9
4338; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:192
4339; GCN-HSA-NEXT:    s_nop 0
4340; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v10
4341; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v11
4342; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:208
4343; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
4344; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v12
4345; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v13
4346; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:160
4347; GCN-HSA-NEXT:    s_nop 0
4348; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v14
4349; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v15
4350; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:176
4351; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
4352; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v16
4353; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v17
4354; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:128
4355; GCN-HSA-NEXT:    s_nop 0
4356; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v18
4357; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v19
4358; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:144
4359; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
4360; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v20
4361; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v21
4362; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:96
4363; GCN-HSA-NEXT:    s_nop 0
4364; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v22
4365; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v23
4366; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:112
4367; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
4368; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v24
4369; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v25
4370; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:64
4371; GCN-HSA-NEXT:    s_nop 0
4372; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v26
4373; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v27
4374; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:80
4375; GCN-HSA-NEXT:    s_waitcnt vmcnt(13)
4376; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v28
4377; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v29
4378; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
4379; GCN-HSA-NEXT:    s_nop 0
4380; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v30
4381; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v31
4382; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
4383; GCN-HSA-NEXT:    s_waitcnt vmcnt(14)
4384; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v32
4385; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v33
4386; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
4387; GCN-HSA-NEXT:    s_nop 0
4388; GCN-HSA-NEXT:    v_mov_b32_e32 v0, v34
4389; GCN-HSA-NEXT:    v_mov_b32_e32 v2, v35
4390; GCN-HSA-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
4391; GCN-HSA-NEXT:    s_endpgm
4392  %ld = load <32 x i32>, ptr addrspace(1) %in
4393  %ext = zext <32 x i32> %ld to <32 x i64>
4394  store <32 x i64> %ext, ptr addrspace(1) %out
4395  ret void
4396}
4397
4398define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
4399; SI-NOHSA-LABEL: global_load_v32i32:
4400; SI-NOHSA:       ; %bb.0:
4401; SI-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
4402; SI-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
4403; SI-NOHSA-NEXT:    s_mov_b32 s6, -1
4404; SI-NOHSA-NEXT:    s_mov_b32 s10, s6
4405; SI-NOHSA-NEXT:    s_mov_b32 s11, s7
4406; SI-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
4407; SI-NOHSA-NEXT:    s_mov_b32 s4, s0
4408; SI-NOHSA-NEXT:    s_mov_b32 s5, s1
4409; SI-NOHSA-NEXT:    s_mov_b32 s8, s2
4410; SI-NOHSA-NEXT:    s_mov_b32 s9, s3
4411; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
4412; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
4413; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112
4414; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96
4415; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80
4416; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64
4417; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
4418; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
4419; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
4420; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96
4421; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112
4422; SI-NOHSA-NEXT:    s_waitcnt vmcnt(4)
4423; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64
4424; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80
4425; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
4426; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0 offset:32
4427; SI-NOHSA-NEXT:    s_waitcnt vmcnt(5)
4428; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48
4429; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
4430; SI-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
4431; SI-NOHSA-NEXT:    s_endpgm
4432;
4433; GCNX3-HSA-LABEL: global_load_v32i32:
4434; GCNX3-HSA:       ; %bb.0:
4435; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4436; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
4437; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
4438; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
4439; GCNX3-HSA-NEXT:    s_add_u32 s6, s2, 48
4440; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
4441; GCNX3-HSA-NEXT:    v_mov_b32_e32 v29, s5
4442; GCNX3-HSA-NEXT:    s_addc_u32 s7, s3, 0
4443; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
4444; GCNX3-HSA-NEXT:    v_mov_b32_e32 v28, s4
4445; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
4446; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
4447; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
4448; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s5
4449; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s4
4450; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x50
4451; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
4452; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s5
4453; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s4
4454; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 64
4455; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
4456; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, s5
4457; GCNX3-HSA-NEXT:    v_mov_b32_e32 v16, s4
4458; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 0x70
4459; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s6
4460; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
4461; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s7
4462; GCNX3-HSA-NEXT:    s_add_u32 s2, s2, 0x60
4463; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
4464; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
4465; GCNX3-HSA-NEXT:    s_addc_u32 s3, s3, 0
4466; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s5
4467; GCNX3-HSA-NEXT:    v_mov_b32_e32 v25, s3
4468; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s4
4469; GCNX3-HSA-NEXT:    v_mov_b32_e32 v24, s2
4470; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
4471; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
4472; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
4473; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
4474; GCNX3-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
4475; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 0x60
4476; GCNX3-HSA-NEXT:    v_mov_b32_e32 v33, s1
4477; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4478; GCNX3-HSA-NEXT:    v_mov_b32_e32 v32, s0
4479; GCNX3-HSA-NEXT:    s_add_u32 s4, s0, 0x70
4480; GCNX3-HSA-NEXT:    s_addc_u32 s5, s1, 0
4481; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4482; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[0:3]
4483; GCNX3-HSA-NEXT:    s_nop 0
4484; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
4485; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
4486; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
4487; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
4488; GCNX3-HSA-NEXT:    s_add_u32 s6, s0, 0x50
4489; GCNX3-HSA-NEXT:    s_addc_u32 s7, s1, 0
4490; GCNX3-HSA-NEXT:    s_add_u32 s8, s0, 32
4491; GCNX3-HSA-NEXT:    s_addc_u32 s9, s1, 0
4492; GCNX3-HSA-NEXT:    s_add_u32 s10, s0, 48
4493; GCNX3-HSA-NEXT:    s_addc_u32 s11, s1, 0
4494; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s10
4495; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s11
4496; GCNX3-HSA-NEXT:    s_add_u32 s0, s0, 16
4497; GCNX3-HSA-NEXT:    s_addc_u32 s1, s1, 0
4498; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4499; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
4500; GCNX3-HSA-NEXT:    s_nop 0
4501; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, s8
4502; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, s9
4503; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4504; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[8:11]
4505; GCNX3-HSA-NEXT:    v_mov_b32_e32 v6, s6
4506; GCNX3-HSA-NEXT:    v_mov_b32_e32 v2, s4
4507; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
4508; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, s7
4509; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s1
4510; GCNX3-HSA-NEXT:    v_mov_b32_e32 v3, s5
4511; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
4512; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s0
4513; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4514; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[12:15]
4515; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4516; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
4517; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4518; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[20:23]
4519; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4520; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
4521; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(7)
4522; GCNX3-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[28:31]
4523; GCNX3-HSA-NEXT:    s_endpgm
4524;
4525; GCNX3-NOHSA-LABEL: global_load_v32i32:
4526; GCNX3-NOHSA:       ; %bb.0:
4527; GCNX3-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
4528; GCNX3-NOHSA-NEXT:    s_mov_b32 s7, 0xf000
4529; GCNX3-NOHSA-NEXT:    s_mov_b32 s6, -1
4530; GCNX3-NOHSA-NEXT:    s_mov_b32 s10, s6
4531; GCNX3-NOHSA-NEXT:    s_mov_b32 s11, s7
4532; GCNX3-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
4533; GCNX3-NOHSA-NEXT:    s_mov_b32 s8, s2
4534; GCNX3-NOHSA-NEXT:    s_mov_b32 s9, s3
4535; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
4536; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
4537; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
4538; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
4539; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32
4540; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
4541; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0
4542; GCNX3-NOHSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
4543; GCNX3-NOHSA-NEXT:    s_mov_b32 s4, s0
4544; GCNX3-NOHSA-NEXT:    s_mov_b32 s5, s1
4545; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
4546; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
4547; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
4548; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(6)
4549; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
4550; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
4551; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
4552; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
4553; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
4554; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
4555; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
4556; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
4557; GCNX3-NOHSA-NEXT:    s_waitcnt vmcnt(7)
4558; GCNX3-NOHSA-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
4559; GCNX3-NOHSA-NEXT:    s_endpgm
4560;
4561; EG-LABEL: global_load_v32i32:
4562; EG:       ; %bb.0:
4563; EG-NEXT:    ALU 23, @28, KC0[CB0:0-32], KC1[]
4564; EG-NEXT:    TEX 7 @12
4565; EG-NEXT:    ALU 1, @52, KC0[], KC1[]
4566; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0
4567; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0
4568; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0
4569; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0
4570; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T3.X, 0
4571; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T2.X, 0
4572; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T1.X, 0
4573; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1
4574; EG-NEXT:    CF_END
4575; EG-NEXT:    Fetch clause starting at 12:
4576; EG-NEXT:     VTX_READ_128 T8.XYZW, T7.X, 96, #1
4577; EG-NEXT:     VTX_READ_128 T9.XYZW, T7.X, 112, #1
4578; EG-NEXT:     VTX_READ_128 T10.XYZW, T7.X, 64, #1
4579; EG-NEXT:     VTX_READ_128 T11.XYZW, T7.X, 80, #1
4580; EG-NEXT:     VTX_READ_128 T12.XYZW, T7.X, 32, #1
4581; EG-NEXT:     VTX_READ_128 T13.XYZW, T7.X, 48, #1
4582; EG-NEXT:     VTX_READ_128 T14.XYZW, T7.X, 0, #1
4583; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 16, #1
4584; EG-NEXT:    ALU clause starting at 28:
4585; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4586; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4587; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
4588; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4589; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4590; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4591; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4592; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
4593; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4594; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
4595; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
4596; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4597; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
4598; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4599; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4600; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
4601; EG-NEXT:     LSHR T5.X, PV.W, literal.x,
4602; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4603; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
4604; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4605; EG-NEXT:     MOV * T7.X, KC0[2].Z,
4606; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4607; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4608; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
4609; EG-NEXT:    ALU clause starting at 52:
4610; EG-NEXT:     LSHR * T15.X, T0.W, literal.x,
4611; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4612;
4613; GCN-HSA-LABEL: global_load_v32i32:
4614; GCN-HSA:       ; %bb.0:
4615; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4616; GCN-HSA-NEXT:    v_mov_b32_e32 v32, 0
4617; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
4618; GCN-HSA-NEXT:    global_load_dwordx4 v[0:3], v32, s[2:3] offset:96
4619; GCN-HSA-NEXT:    global_load_dwordx4 v[4:7], v32, s[2:3] offset:112
4620; GCN-HSA-NEXT:    global_load_dwordx4 v[8:11], v32, s[2:3] offset:64
4621; GCN-HSA-NEXT:    global_load_dwordx4 v[12:15], v32, s[2:3] offset:80
4622; GCN-HSA-NEXT:    global_load_dwordx4 v[16:19], v32, s[2:3] offset:32
4623; GCN-HSA-NEXT:    global_load_dwordx4 v[20:23], v32, s[2:3] offset:48
4624; GCN-HSA-NEXT:    global_load_dwordx4 v[24:27], v32, s[2:3]
4625; GCN-HSA-NEXT:    global_load_dwordx4 v[28:31], v32, s[2:3] offset:16
4626; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4627; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
4628; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4629; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
4630; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4631; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
4632; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4633; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
4634; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4635; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
4636; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4637; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
4638; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4639; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1]
4640; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4641; GCN-HSA-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:16
4642; GCN-HSA-NEXT:    s_endpgm
4643  %ld = load <32 x i32>, ptr addrspace(1) %in
4644  store <32 x i32> %ld, ptr addrspace(1) %out
4645  ret void
4646}
4647
; Attribute group #0: referenced through the trailing `#0` on kernel
; definitions in this file (for example @global_load_i32). It applies
; `nounwind` and sets the string attribute "amdgpu-flat-work-group-size"
; to "1024,1024".
; NOTE(review): the value is presumably a "min,max" flat work-group size
; pair as described in the AMDGPU backend documentation - confirm there.
4648attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
4649