xref: /llvm-project/llvm/test/CodeGen/AMDGPU/load-global-i16.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s
7
8; FIXME: r600 is broken because the bigger testcases spill and spilling is not implemented
9
; Kernel under test: copies a single i16 from %in to %out (both addrspace(1)).
; The autogenerated assertions below expect one 16-bit load and one 16-bit
; store on every run line: buffer_load_ushort/buffer_store_short on the
; buffer-addressed runs, flat_load_ushort/flat_store_short on the amdhsa run,
; and VTX_READ_16 followed by a MEM_RAT MSKOR store on the r600 runs.
10define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
11; GCN-NOHSA-SI-LABEL: global_load_i16:
12; GCN-NOHSA-SI:       ; %bb.0: ; %entry
13; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
14; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
15; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
16; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
17; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
18; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
19; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
20; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
21; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
22; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
23; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
24; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
25; GCN-NOHSA-SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
26; GCN-NOHSA-SI-NEXT:    s_endpgm
27;
28; GCN-HSA-LABEL: global_load_i16:
29; GCN-HSA:       ; %bb.0: ; %entry
30; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
31; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
32; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
33; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
34; GCN-HSA-NEXT:    flat_load_ushort v2, v[0:1]
35; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
36; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
37; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
38; GCN-HSA-NEXT:    flat_store_short v[0:1], v2
39; GCN-HSA-NEXT:    s_endpgm
40;
41; GCN-NOHSA-VI-LABEL: global_load_i16:
42; GCN-NOHSA-VI:       ; %bb.0: ; %entry
43; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
44; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
45; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
46; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
47; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
48; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
49; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
50; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
51; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
52; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
53; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
54; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
55; GCN-NOHSA-VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
56; GCN-NOHSA-VI-NEXT:    s_endpgm
57;
58; EG-LABEL: global_load_i16:
59; EG:       ; %bb.0: ; %entry
60; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
61; EG-NEXT:    TEX 0 @6
62; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
63; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
64; EG-NEXT:    CF_END
65; EG-NEXT:    PAD
66; EG-NEXT:    Fetch clause starting at 6:
67; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
68; EG-NEXT:    ALU clause starting at 8:
69; EG-NEXT:     MOV * T0.X, KC0[2].Z,
70; EG-NEXT:    ALU clause starting at 9:
71; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
72; EG-NEXT:     AND_INT * T1.W, T0.X, literal.y,
73; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
74; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
75; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
76; EG-NEXT:     LSHL T0.X, T1.W, PV.W,
77; EG-NEXT:     LSHL * T0.W, literal.x, PV.W,
78; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
79; EG-NEXT:     MOV T0.Y, 0.0,
80; EG-NEXT:     MOV * T0.Z, 0.0,
81; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
82; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
83;
84; CM-LABEL: global_load_i16:
85; CM:       ; %bb.0: ; %entry
86; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
87; CM-NEXT:    TEX 0 @6
88; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
89; CM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
90; CM-NEXT:    CF_END
91; CM-NEXT:    PAD
92; CM-NEXT:    Fetch clause starting at 6:
93; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
94; CM-NEXT:    ALU clause starting at 8:
95; CM-NEXT:     MOV * T0.X, KC0[2].Z,
96; CM-NEXT:    ALU clause starting at 9:
97; CM-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
98; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
99; CM-NEXT:     AND_INT T0.Z, T0.X, literal.x,
100; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
101; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
102; CM-NEXT:     LSHL T0.X, PV.Z, PV.W,
103; CM-NEXT:     LSHL * T0.W, literal.x, PV.W,
104; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
105; CM-NEXT:     MOV T0.Y, 0.0,
106; CM-NEXT:     MOV * T0.Z, 0.0,
107; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
108; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
109entry:
; Scalar i16 round trip: one load from %in, one store to %out.
110  %ld = load i16, ptr addrspace(1) %in
111  store i16 %ld, ptr addrspace(1) %out
112  ret void
113}
114
; Kernel under test: copies a <2 x i16> vector from %in to %out (both
; addrspace(1)). The autogenerated assertions below expect the 32-bit vector
; to be moved with a single dword-sized access on every run line:
; buffer_load_dword/buffer_store_dword, flat_load_dword/flat_store_dword, and
; VTX_READ_32 with a MEM_RAT_CACHELESS store on the r600 runs.
115define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
116; GCN-NOHSA-SI-LABEL: global_load_v2i16:
117; GCN-NOHSA-SI:       ; %bb.0: ; %entry
118; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
119; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
120; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
121; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
122; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
123; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
124; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
125; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
126; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
127; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
128; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
129; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
130; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
131; GCN-NOHSA-SI-NEXT:    s_endpgm
132;
133; GCN-HSA-LABEL: global_load_v2i16:
134; GCN-HSA:       ; %bb.0: ; %entry
135; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
136; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
137; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
138; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
139; GCN-HSA-NEXT:    flat_load_dword v2, v[0:1]
140; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
141; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
142; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
143; GCN-HSA-NEXT:    flat_store_dword v[0:1], v2
144; GCN-HSA-NEXT:    s_endpgm
145;
146; GCN-NOHSA-VI-LABEL: global_load_v2i16:
147; GCN-NOHSA-VI:       ; %bb.0: ; %entry
148; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
149; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
150; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
151; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
152; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
153; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
154; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
155; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
156; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
157; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
158; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
159; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
160; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
161; GCN-NOHSA-VI-NEXT:    s_endpgm
162;
163; EG-LABEL: global_load_v2i16:
164; EG:       ; %bb.0: ; %entry
165; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
166; EG-NEXT:    TEX 0 @6
167; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
168; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
169; EG-NEXT:    CF_END
170; EG-NEXT:    PAD
171; EG-NEXT:    Fetch clause starting at 6:
172; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
173; EG-NEXT:    ALU clause starting at 8:
174; EG-NEXT:     MOV * T0.X, KC0[2].Z,
175; EG-NEXT:    ALU clause starting at 9:
176; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
177; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
178;
179; CM-LABEL: global_load_v2i16:
180; CM:       ; %bb.0: ; %entry
181; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
182; CM-NEXT:    TEX 0 @6
183; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
184; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
185; CM-NEXT:    CF_END
186; CM-NEXT:    PAD
187; CM-NEXT:    Fetch clause starting at 6:
188; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
189; CM-NEXT:    ALU clause starting at 8:
190; CM-NEXT:     MOV * T0.X, KC0[2].Z,
191; CM-NEXT:    ALU clause starting at 9:
192; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
193; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
194entry:
; <2 x i16> round trip: one vector load from %in, one vector store to %out.
195  %ld = load <2 x i16>, ptr addrspace(1) %in
196  store <2 x i16> %ld, ptr addrspace(1) %out
197  ret void
198}
199
; Kernel under test: copies a <3 x i16> vector from %in to %out (both
; addrspace(1)). This is the odd-sized case, so the expected code is split:
;  * On the three GCN run lines the load is a single dwordx2 access and the
;    store is emitted as two pieces: a short store of the third element at
;    byte offset 4 (buffer `offset:4`, or an s_add_u32/s_addc_u32 computed
;    address on the flat run) followed by a dword store of the first two.
;  * On the r600 run lines the vector is fetched with three VTX_READ_16 reads
;    at byte offsets 0, 2 and 4. The first two elements are packed with
;    LSHL 16 / AND_INT 65535 / OR_INT and written with a dword store, and the
;    third element is written with a MEM_RAT MSKOR store.
200define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
201; GCN-NOHSA-SI-LABEL: global_load_v3i16:
202; GCN-NOHSA-SI:       ; %bb.0: ; %entry
203; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
204; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
205; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
206; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
207; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
208; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
209; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
210; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
211; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
212; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
213; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
214; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
215; GCN-NOHSA-SI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
216; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
217; GCN-NOHSA-SI-NEXT:    s_endpgm
218;
219; GCN-HSA-LABEL: global_load_v3i16:
220; GCN-HSA:       ; %bb.0: ; %entry
221; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
222; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
223; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
224; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
225; GCN-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
226; GCN-HSA-NEXT:    s_add_u32 s2, s0, 4
227; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
228; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
229; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
230; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
231; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
232; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
233; GCN-HSA-NEXT:    flat_store_short v[4:5], v1
234; GCN-HSA-NEXT:    flat_store_dword v[2:3], v0
235; GCN-HSA-NEXT:    s_endpgm
236;
237; GCN-NOHSA-VI-LABEL: global_load_v3i16:
238; GCN-NOHSA-VI:       ; %bb.0: ; %entry
239; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
240; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
241; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
242; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
243; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
244; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
245; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
246; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
247; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
248; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
249; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
250; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
251; GCN-NOHSA-VI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
252; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
253; GCN-NOHSA-VI-NEXT:    s_endpgm
254;
255; EG-LABEL: global_load_v3i16:
256; EG:       ; %bb.0: ; %entry
257; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
258; EG-NEXT:    TEX 2 @6
259; EG-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
260; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
261; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
262; EG-NEXT:    CF_END
263; EG-NEXT:    Fetch clause starting at 6:
264; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 0, #1
265; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 2, #1
266; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 4, #1
267; EG-NEXT:    ALU clause starting at 12:
268; EG-NEXT:     MOV * T5.X, KC0[2].Z,
269; EG-NEXT:    ALU clause starting at 13:
270; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
271; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
272; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
273; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
274; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
275; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
276; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
277; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
278; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
279; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
280; EG-NEXT:     MOV T5.Y, 0.0,
281; EG-NEXT:     MOV * T5.Z, 0.0,
282; EG-NEXT:     LSHR T8.X, T0.W, literal.x,
283; EG-NEXT:     LSHL T0.W, T7.X, literal.y,
284; EG-NEXT:     AND_INT * T1.W, T6.X, literal.z,
285; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
286; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
287; EG-NEXT:     OR_INT T6.X, PV.W, PS,
288; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
289; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
290;
291; CM-LABEL: global_load_v3i16:
292; CM:       ; %bb.0: ; %entry
293; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
294; CM-NEXT:    TEX 2 @6
295; CM-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
296; CM-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
297; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
298; CM-NEXT:    CF_END
299; CM-NEXT:    Fetch clause starting at 6:
300; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 0, #1
301; CM-NEXT:     VTX_READ_16 T7.X, T5.X, 2, #1
302; CM-NEXT:     VTX_READ_16 T5.X, T5.X, 4, #1
303; CM-NEXT:    ALU clause starting at 12:
304; CM-NEXT:     MOV * T5.X, KC0[2].Z,
305; CM-NEXT:    ALU clause starting at 13:
306; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
307; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
308; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
309; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
310; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
311; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
312; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
313; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
314; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
315; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
316; CM-NEXT:     MOV T5.Y, 0.0,
317; CM-NEXT:     MOV * T5.Z, 0.0,
318; CM-NEXT:     LSHL T0.Z, T7.X, literal.x,
319; CM-NEXT:     AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
320; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
321; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
322; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
323; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
324; CM-NEXT:     LSHR * T8.X, T0.W, literal.x,
325; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
326entry:
; <3 x i16> round trip: one vector load from %in, one vector store to %out.
327  %ld = load <3 x i16>, ptr addrspace(1) %in
328  store <3 x i16> %ld, ptr addrspace(1) %out
329  ret void
330}
331
; Kernel under test: copies a <4 x i16> vector from %in to %out (both
; addrspace(1)). The autogenerated assertions below expect the 64-bit vector
; to be moved with a single access on every run line:
; buffer_load_dwordx2/buffer_store_dwordx2, flat_load_dwordx2/
; flat_store_dwordx2, and VTX_READ_64 with a MEM_RAT_CACHELESS store on the
; r600 runs.
332define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
333; GCN-NOHSA-SI-LABEL: global_load_v4i16:
334; GCN-NOHSA-SI:       ; %bb.0: ; %entry
335; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
336; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
337; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
338; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
339; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
340; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
341; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
342; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
343; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
344; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
345; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
346; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
347; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
348; GCN-NOHSA-SI-NEXT:    s_endpgm
349;
350; GCN-HSA-LABEL: global_load_v4i16:
351; GCN-HSA:       ; %bb.0: ; %entry
352; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
353; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
354; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
355; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
356; GCN-HSA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
357; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
358; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
359; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
360; GCN-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
361; GCN-HSA-NEXT:    s_endpgm
362;
363; GCN-NOHSA-VI-LABEL: global_load_v4i16:
364; GCN-NOHSA-VI:       ; %bb.0: ; %entry
365; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
366; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
367; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
368; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
369; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
370; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
371; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
372; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
373; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
374; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
375; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
376; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
377; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
378; GCN-NOHSA-VI-NEXT:    s_endpgm
379;
380; EG-LABEL: global_load_v4i16:
381; EG:       ; %bb.0: ; %entry
382; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
383; EG-NEXT:    TEX 0 @6
384; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
385; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
386; EG-NEXT:    CF_END
387; EG-NEXT:    PAD
388; EG-NEXT:    Fetch clause starting at 6:
389; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
390; EG-NEXT:    ALU clause starting at 8:
391; EG-NEXT:     MOV * T0.X, KC0[2].Z,
392; EG-NEXT:    ALU clause starting at 9:
393; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
394; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
395;
396; CM-LABEL: global_load_v4i16:
397; CM:       ; %bb.0: ; %entry
398; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
399; CM-NEXT:    TEX 0 @6
400; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
401; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
402; CM-NEXT:    CF_END
403; CM-NEXT:    PAD
404; CM-NEXT:    Fetch clause starting at 6:
405; CM-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
406; CM-NEXT:    ALU clause starting at 8:
407; CM-NEXT:     MOV * T0.X, KC0[2].Z,
408; CM-NEXT:    ALU clause starting at 9:
409; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
410; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
411entry:
; <4 x i16> round trip: one vector load from %in, one vector store to %out.
412  %ld = load <4 x i16>, ptr addrspace(1) %in
413  store <4 x i16> %ld, ptr addrspace(1) %out
414  ret void
415}
416
; Kernel under test: copies an <8 x i16> vector from %in to %out (both
; addrspace(1)). The autogenerated assertions below expect the 128-bit vector
; to be moved with a single access on every run line:
; buffer_load_dwordx4/buffer_store_dwordx4, flat_load_dwordx4/
; flat_store_dwordx4, and VTX_READ_128 with a MEM_RAT_CACHELESS store on the
; r600 runs.
417define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
418; GCN-NOHSA-SI-LABEL: global_load_v8i16:
419; GCN-NOHSA-SI:       ; %bb.0: ; %entry
420; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
421; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
422; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
423; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
424; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
425; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
426; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
427; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
428; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
429; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
430; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
431; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
432; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
433; GCN-NOHSA-SI-NEXT:    s_endpgm
434;
435; GCN-HSA-LABEL: global_load_v8i16:
436; GCN-HSA:       ; %bb.0: ; %entry
437; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
438; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
439; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
440; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
441; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
442; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
443; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
444; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
445; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
446; GCN-HSA-NEXT:    s_endpgm
447;
448; GCN-NOHSA-VI-LABEL: global_load_v8i16:
449; GCN-NOHSA-VI:       ; %bb.0: ; %entry
450; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
451; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
452; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
453; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
454; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
455; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
456; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
457; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
458; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
459; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
460; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
461; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
462; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
463; GCN-NOHSA-VI-NEXT:    s_endpgm
464;
465; EG-LABEL: global_load_v8i16:
466; EG:       ; %bb.0: ; %entry
467; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
468; EG-NEXT:    TEX 0 @6
469; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
470; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
471; EG-NEXT:    CF_END
472; EG-NEXT:    PAD
473; EG-NEXT:    Fetch clause starting at 6:
474; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
475; EG-NEXT:    ALU clause starting at 8:
476; EG-NEXT:     MOV * T0.X, KC0[2].Z,
477; EG-NEXT:    ALU clause starting at 9:
478; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
479; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
480;
481; CM-LABEL: global_load_v8i16:
482; CM:       ; %bb.0: ; %entry
483; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
484; CM-NEXT:    TEX 0 @6
485; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
486; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
487; CM-NEXT:    CF_END
488; CM-NEXT:    PAD
489; CM-NEXT:    Fetch clause starting at 6:
490; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
491; CM-NEXT:    ALU clause starting at 8:
492; CM-NEXT:     MOV * T0.X, KC0[2].Z,
493; CM-NEXT:    ALU clause starting at 9:
494; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
495; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
496entry:
; <8 x i16> round trip: one vector load from %in, one vector store to %out.
497  %ld = load <8 x i16>, ptr addrspace(1) %in
498  store <8 x i16> %ld, ptr addrspace(1) %out
499  ret void
500}
501
; Kernel under test: copies a <16 x i16> vector from %in to %out (both
; addrspace(1)). The 256-bit vector is expected to be split into two 128-bit
; halves on every run line:
;  * GCN runs: two dwordx4 loads (byte offsets 16 and 0) followed by two
;    dwordx4 stores to the matching offsets. Each store is preceded by
;    `s_waitcnt vmcnt(1)`. The flat run forms the +16 addresses with
;    s_add_u32/s_addc_u32 instead of an `offset:` field.
;  * r600 runs: two VTX_READ_128 fetches (byte offsets 0 and 16), each in its
;    own fetch clause, and two MEM_RAT_CACHELESS stores.
502define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
503; GCN-NOHSA-SI-LABEL: global_load_v16i16:
504; GCN-NOHSA-SI:       ; %bb.0: ; %entry
505; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
506; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
507; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
508; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
509; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
510; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
511; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
512; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
513; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
514; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
515; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
516; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
517; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
518; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
519; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
520; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
521; GCN-NOHSA-SI-NEXT:    s_endpgm
522;
523; GCN-HSA-LABEL: global_load_v16i16:
524; GCN-HSA:       ; %bb.0: ; %entry
525; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
526; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
527; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
528; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
529; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
530; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
531; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
532; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
533; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
534; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
535; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
536; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
537; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
538; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
539; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
540; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
541; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
542; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
543; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
544; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
545; GCN-HSA-NEXT:    s_endpgm
546;
547; GCN-NOHSA-VI-LABEL: global_load_v16i16:
548; GCN-NOHSA-VI:       ; %bb.0: ; %entry
549; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
550; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
551; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
552; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
553; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
554; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
555; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
556; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
557; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
558; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
559; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
560; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
561; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
562; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
563; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
564; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
565; GCN-NOHSA-VI-NEXT:    s_endpgm
566;
567; EG-LABEL: global_load_v16i16:
568; EG:       ; %bb.0: ; %entry
569; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
570; EG-NEXT:    TEX 0 @8
571; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
572; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
573; EG-NEXT:    TEX 0 @10
574; EG-NEXT:    ALU 3, @15, KC0[CB0:0-32], KC1[]
575; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
576; EG-NEXT:    CF_END
577; EG-NEXT:    Fetch clause starting at 8:
578; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
579; EG-NEXT:    Fetch clause starting at 10:
580; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
581; EG-NEXT:    ALU clause starting at 12:
582; EG-NEXT:     MOV * T0.X, KC0[2].Z,
583; EG-NEXT:    ALU clause starting at 13:
584; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
585; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
586; EG-NEXT:    ALU clause starting at 15:
587; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
588; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
589; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
590; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
591;
592; CM-LABEL: global_load_v16i16:
593; CM:       ; %bb.0: ; %entry
594; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
595; CM-NEXT:    TEX 0 @8
596; CM-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
597; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
598; CM-NEXT:    TEX 0 @10
599; CM-NEXT:    ALU 3, @15, KC0[CB0:0-32], KC1[]
600; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
601; CM-NEXT:    CF_END
602; CM-NEXT:    Fetch clause starting at 8:
603; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 0, #1
604; CM-NEXT:    Fetch clause starting at 10:
605; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
606; CM-NEXT:    ALU clause starting at 12:
607; CM-NEXT:     MOV * T0.X, KC0[2].Z,
608; CM-NEXT:    ALU clause starting at 13:
609; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
610; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
611; CM-NEXT:    ALU clause starting at 15:
612; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
613; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
614; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
615; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
616entry:
; <16 x i16> round trip: one vector load from %in, one vector store to %out.
617  %ld = load <16 x i16>, ptr addrspace(1) %in
618  store <16 x i16> %ld, ptr addrspace(1) %out
619  ret void
620}
621
622define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
623; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2:
624; GCN-NOHSA-SI:       ; %bb.0: ; %entry
625; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
626; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, 0xf000
627; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, -1
628; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, s10
629; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, s11
630; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
631; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s4
632; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s5
633; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s6
634; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s7
635; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
636; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
637; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v4, off, s[8:11], 0 offset:4
638; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:6
639; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v5, off, s[8:11], 0 offset:8
640; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v3, off, s[8:11], 0 offset:10
641; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v6, off, s[8:11], 0 offset:12
642; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v7, off, s[8:11], 0 offset:14
643; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v8, off, s[8:11], 0 offset:16
644; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v9, off, s[8:11], 0 offset:18
645; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v10, off, s[8:11], 0 offset:20
646; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v11, off, s[8:11], 0 offset:22
647; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v12, off, s[8:11], 0 offset:24
648; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v13, off, s[8:11], 0 offset:26
649; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v14, off, s[8:11], 0 offset:28
650; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v15, off, s[8:11], 0 offset:30
651; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
652; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
653; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
654; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
655; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
656; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
657; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
658; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
659; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
660; GCN-NOHSA-SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
661; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v3, v7, v6
662; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v2, v16, v5
663; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v1, v17, v4
664; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v0, v18, v0
665; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v7, v15, v14
666; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v6, v13, v12
667; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v5, v11, v10
668; GCN-NOHSA-SI-NEXT:    v_or_b32_e32 v4, v9, v8
669; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
670; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
671; GCN-NOHSA-SI-NEXT:    s_endpgm
672;
673; GCN-HSA-LABEL: global_load_v16i16_align2:
674; GCN-HSA:       ; %bb.0: ; %entry
675; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
676; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
677; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
678; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
679; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
680; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
681; GCN-HSA-NEXT:    s_add_u32 s0, s0, 16
682; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
683; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
684; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
685; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
686; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
687; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
688; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
689; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
690; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
691; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
692; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
693; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
694; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
695; GCN-HSA-NEXT:    s_endpgm
696;
697; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2:
698; GCN-NOHSA-VI:       ; %bb.0: ; %entry
699; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
700; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
701; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
702; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
703; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
704; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
705; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 offset:14
706; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:10
707; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:6
708; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 offset:2
709; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v4, off, s[4:7], 0 offset:30
710; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v5, off, s[4:7], 0 offset:26
711; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v6, off, s[4:7], 0 offset:22
712; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v7, off, s[4:7], 0 offset:18
713; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v8, off, s[4:7], 0 offset:12
714; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v9, off, s[4:7], 0 offset:8
715; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v10, off, s[4:7], 0 offset:4
716; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v11, off, s[4:7], 0
717; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v12, off, s[4:7], 0 offset:28
718; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v13, off, s[4:7], 0 offset:24
719; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v14, off, s[4:7], 0 offset:20
720; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v15, off, s[4:7], 0 offset:16
721; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s2
722; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s3
723; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(14)
724; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
725; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
726; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(13)
727; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v16, 16, v2
728; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(12)
729; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
730; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(11)
731; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
732; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(10)
733; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
734; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(9)
735; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
736; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(8)
737; GCN-NOHSA-VI-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
738; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(7)
739; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v3, v8, v0
740; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(6)
741; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v2, v9, v1
742; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(5)
743; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v1, v10, v16
744; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(4)
745; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v0, v11, v17
746; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
747; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v7, v12, v4
748; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(2)
749; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v6, v13, v5
750; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
751; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v5, v14, v18
752; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
753; GCN-NOHSA-VI-NEXT:    v_or_b32_e32 v4, v15, v19
754; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
755; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
756; GCN-NOHSA-VI-NEXT:    s_endpgm
757;
758; EG-LABEL: global_load_v16i16_align2:
759; EG:       ; %bb.0: ; %entry
760; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
761; EG-NEXT:    TEX 1 @6
762; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
763; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
764; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
765; EG-NEXT:    CF_END
766; EG-NEXT:    Fetch clause starting at 6:
767; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
768; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
769; EG-NEXT:    ALU clause starting at 10:
770; EG-NEXT:     MOV * T0.X, KC0[2].Y,
771; EG-NEXT:    ALU clause starting at 11:
772; EG-NEXT:     LSHR T2.X, KC0[2].Z, literal.x,
773; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.y,
774; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
775; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
776; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
777;
778; CM-LABEL: global_load_v16i16_align2:
779; CM:       ; %bb.0: ; %entry
780; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
781; CM-NEXT:    TEX 1 @6
782; CM-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
783; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
784; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
785; CM-NEXT:    CF_END
786; CM-NEXT:    Fetch clause starting at 6:
787; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
788; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
789; CM-NEXT:    ALU clause starting at 10:
790; CM-NEXT:     MOV * T0.X, KC0[2].Y,
791; CM-NEXT:    ALU clause starting at 11:
792; CM-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
793; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
794; CM-NEXT:     LSHR * T2.X, PV.W, literal.x,
795; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
796; CM-NEXT:     LSHR * T3.X, KC0[2].Z, literal.x,
797; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
798entry:
799  %ld =  load <16 x i16>, ptr addrspace(1) %in, align 2
800  store <16 x i16> %ld, ptr addrspace(1) %out, align 32
801  ret void
802}
803
; Scalar zero-extending load: loads one i16 from %in, zero-extends it to i32,
; and stores the result to %out. The GCN checks expect a single ushort load
; whose result is stored directly as a dword; the EG/CM checks expect a
; VTX_READ_16 with no separate extend instruction.
804define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
805; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32:
806; GCN-NOHSA-SI:       ; %bb.0:
807; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
808; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
809; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
810; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
811; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
812; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
813; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
814; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
815; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
816; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
817; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
818; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
819; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
820; GCN-NOHSA-SI-NEXT:    s_endpgm
821;
822; GCN-HSA-LABEL: global_zextload_i16_to_i32:
823; GCN-HSA:       ; %bb.0:
824; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
825; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
826; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
827; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
828; GCN-HSA-NEXT:    flat_load_ushort v2, v[0:1]
829; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
830; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
831; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
832; GCN-HSA-NEXT:    flat_store_dword v[0:1], v2
833; GCN-HSA-NEXT:    s_endpgm
834;
835; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32:
836; GCN-NOHSA-VI:       ; %bb.0:
837; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
838; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
839; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
840; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
841; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
842; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
843; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
844; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
845; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
846; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
847; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
848; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
849; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
850; GCN-NOHSA-VI-NEXT:    s_endpgm
851;
852; EG-LABEL: global_zextload_i16_to_i32:
853; EG:       ; %bb.0:
854; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
855; EG-NEXT:    TEX 0 @6
856; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
857; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
858; EG-NEXT:    CF_END
859; EG-NEXT:    PAD
860; EG-NEXT:    Fetch clause starting at 6:
861; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
862; EG-NEXT:    ALU clause starting at 8:
863; EG-NEXT:     MOV * T0.X, KC0[2].Z,
864; EG-NEXT:    ALU clause starting at 9:
865; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
866; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
867;
868; CM-LABEL: global_zextload_i16_to_i32:
869; CM:       ; %bb.0:
870; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
871; CM-NEXT:    TEX 0 @6
872; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
873; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
874; CM-NEXT:    CF_END
875; CM-NEXT:    PAD
876; CM-NEXT:    Fetch clause starting at 6:
877; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
878; CM-NEXT:    ALU clause starting at 8:
879; CM-NEXT:     MOV * T0.X, KC0[2].Z,
880; CM-NEXT:    ALU clause starting at 9:
881; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
882; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
883  %a = load i16, ptr addrspace(1) %in
884  %ext = zext i16 %a to i32
885  store i32 %ext, ptr addrspace(1) %out
886  ret void
887}
888
; Scalar sign-extending load: loads one i16 from %in, sign-extends it to i32,
; and stores the result to %out. The GCN checks expect a single sshort load
; stored directly as a dword; the EG/CM checks expect a VTX_READ_16 followed
; by a BFE_INT that uses the literal 16.
889define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
890; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32:
891; GCN-NOHSA-SI:       ; %bb.0:
892; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
893; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
894; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
895; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
896; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
897; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
898; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
899; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
900; GCN-NOHSA-SI-NEXT:    buffer_load_sshort v0, off, s[8:11], 0
901; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
902; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
903; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
904; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
905; GCN-NOHSA-SI-NEXT:    s_endpgm
906;
907; GCN-HSA-LABEL: global_sextload_i16_to_i32:
908; GCN-HSA:       ; %bb.0:
909; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
910; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
911; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
912; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
913; GCN-HSA-NEXT:    flat_load_sshort v2, v[0:1]
914; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
915; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
916; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
917; GCN-HSA-NEXT:    flat_store_dword v[0:1], v2
918; GCN-HSA-NEXT:    s_endpgm
919;
920; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32:
921; GCN-NOHSA-VI:       ; %bb.0:
922; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
923; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
924; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
925; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
926; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
927; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
928; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
929; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
930; GCN-NOHSA-VI-NEXT:    buffer_load_sshort v0, off, s[8:11], 0
931; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
932; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
933; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
934; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
935; GCN-NOHSA-VI-NEXT:    s_endpgm
936;
937; EG-LABEL: global_sextload_i16_to_i32:
938; EG:       ; %bb.0:
939; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
940; EG-NEXT:    TEX 0 @6
941; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
942; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
943; EG-NEXT:    CF_END
944; EG-NEXT:    PAD
945; EG-NEXT:    Fetch clause starting at 6:
946; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
947; EG-NEXT:    ALU clause starting at 8:
948; EG-NEXT:     MOV * T0.X, KC0[2].Z,
949; EG-NEXT:    ALU clause starting at 9:
950; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
951; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
952; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
953;
954; CM-LABEL: global_sextload_i16_to_i32:
955; CM:       ; %bb.0:
956; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
957; CM-NEXT:    TEX 0 @6
958; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
959; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
960; CM-NEXT:    CF_END
961; CM-NEXT:    PAD
962; CM-NEXT:    Fetch clause starting at 6:
963; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
964; CM-NEXT:    ALU clause starting at 8:
965; CM-NEXT:     MOV * T0.X, KC0[2].Z,
966; CM-NEXT:    ALU clause starting at 9:
967; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
968; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
969; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
970; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
971  %a = load i16, ptr addrspace(1) %in
972  %ext = sext i16 %a to i32
973  store i32 %ext, ptr addrspace(1) %out
974  ret void
975}
976
; Single-element vector variant of the zero-extending load: loads <1 x i16>
; from %in, zero-extends it to <1 x i32>, and stores the result to %out. The
; checks expect one ushort load (VTX_READ_16 on EG/CM) followed by a dword
; store, with no separate extend instruction.
977define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
978; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32:
979; GCN-NOHSA-SI:       ; %bb.0:
980; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
981; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
982; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
983; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
984; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
985; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
986; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
987; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
988; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
989; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
990; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
991; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
992; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
993; GCN-NOHSA-SI-NEXT:    s_endpgm
994;
995; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
996; GCN-HSA:       ; %bb.0:
997; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
998; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
999; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1000; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1001; GCN-HSA-NEXT:    flat_load_ushort v2, v[0:1]
1002; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
1003; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
1004; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1005; GCN-HSA-NEXT:    flat_store_dword v[0:1], v2
1006; GCN-HSA-NEXT:    s_endpgm
1007;
1008; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32:
1009; GCN-NOHSA-VI:       ; %bb.0:
1010; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1011; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1012; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1013; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1014; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1015; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1016; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1017; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1018; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1019; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1020; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1021; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1022; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1023; GCN-NOHSA-VI-NEXT:    s_endpgm
1024;
1025; EG-LABEL: global_zextload_v1i16_to_v1i32:
1026; EG:       ; %bb.0:
1027; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1028; EG-NEXT:    TEX 0 @6
1029; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
1030; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1031; EG-NEXT:    CF_END
1032; EG-NEXT:    PAD
1033; EG-NEXT:    Fetch clause starting at 6:
1034; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1035; EG-NEXT:    ALU clause starting at 8:
1036; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1037; EG-NEXT:    ALU clause starting at 9:
1038; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1039; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1040;
1041; CM-LABEL: global_zextload_v1i16_to_v1i32:
1042; CM:       ; %bb.0:
1043; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1044; CM-NEXT:    TEX 0 @6
1045; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
1046; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1047; CM-NEXT:    CF_END
1048; CM-NEXT:    PAD
1049; CM-NEXT:    Fetch clause starting at 6:
1050; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1051; CM-NEXT:    ALU clause starting at 8:
1052; CM-NEXT:     MOV * T0.X, KC0[2].Z,
1053; CM-NEXT:    ALU clause starting at 9:
1054; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1055; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1056  %load = load <1 x i16>, ptr addrspace(1) %in
1057  %ext = zext <1 x i16> %load to <1 x i32>
1058  store <1 x i32> %ext, ptr addrspace(1) %out
1059  ret void
1060}
1061
; Single-element vector variant of the sign-extending load: loads <1 x i16>
; from %in, sign-extends it to <1 x i32>, and stores the result to %out. The
; GCN checks expect one sshort load stored directly as a dword; the EG/CM
; checks expect a VTX_READ_16 followed by a BFE_INT that uses the literal 16.
1062define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1063; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32:
1064; GCN-NOHSA-SI:       ; %bb.0:
1065; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1066; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1067; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1068; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1069; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1070; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1071; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1072; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1073; GCN-NOHSA-SI-NEXT:    buffer_load_sshort v0, off, s[8:11], 0
1074; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1075; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1076; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1077; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1078; GCN-NOHSA-SI-NEXT:    s_endpgm
1079;
1080; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
1081; GCN-HSA:       ; %bb.0:
1082; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1083; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1084; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1085; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1086; GCN-HSA-NEXT:    flat_load_sshort v2, v[0:1]
1087; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
1088; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
1089; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1090; GCN-HSA-NEXT:    flat_store_dword v[0:1], v2
1091; GCN-HSA-NEXT:    s_endpgm
1092;
1093; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32:
1094; GCN-NOHSA-VI:       ; %bb.0:
1095; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1096; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1097; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1098; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1099; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1100; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1101; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1102; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1103; GCN-NOHSA-VI-NEXT:    buffer_load_sshort v0, off, s[8:11], 0
1104; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1105; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1106; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1107; GCN-NOHSA-VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1108; GCN-NOHSA-VI-NEXT:    s_endpgm
1109;
1110; EG-LABEL: global_sextload_v1i16_to_v1i32:
1111; EG:       ; %bb.0:
1112; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1113; EG-NEXT:    TEX 0 @6
1114; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
1115; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1116; EG-NEXT:    CF_END
1117; EG-NEXT:    PAD
1118; EG-NEXT:    Fetch clause starting at 6:
1119; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1120; EG-NEXT:    ALU clause starting at 8:
1121; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1122; EG-NEXT:    ALU clause starting at 9:
1123; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
1124; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1125; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
1126;
1127; CM-LABEL: global_sextload_v1i16_to_v1i32:
1128; CM:       ; %bb.0:
1129; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1130; CM-NEXT:    TEX 0 @6
1131; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
1132; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
1133; CM-NEXT:    CF_END
1134; CM-NEXT:    PAD
1135; CM-NEXT:    Fetch clause starting at 6:
1136; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1137; CM-NEXT:    ALU clause starting at 8:
1138; CM-NEXT:     MOV * T0.X, KC0[2].Z,
1139; CM-NEXT:    ALU clause starting at 9:
1140; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
1141; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1142; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1143; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1144  %load = load <1 x i16>, ptr addrspace(1) %in
1145  %ext = sext <1 x i16> %load to <1 x i32>
1146  store <1 x i32> %ext, ptr addrspace(1) %out
1147  ret void
1148}
1149
; Two-element zero-extending load: loads <2 x i16> from %in, zero-extends it
; to <2 x i32>, and stores the result to %out. The checks expect both
; elements to be fetched with one 32-bit load and then split: a logical shift
; right by 16 produces the high element and an AND with 0xffff (65535)
; produces the low element.
1150define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1151; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32:
1152; GCN-NOHSA-SI:       ; %bb.0:
1153; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1154; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1155; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1156; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1157; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1158; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1159; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1160; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1161; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1162; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1163; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1164; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1165; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1166; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1167; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1168; GCN-NOHSA-SI-NEXT:    s_endpgm
1169;
1170; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
1171; GCN-HSA:       ; %bb.0:
1172; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1173; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1174; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1175; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1176; GCN-HSA-NEXT:    flat_load_dword v2, v[0:1]
1177; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
1178; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
1179; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1180; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1181; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1182; GCN-HSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1183; GCN-HSA-NEXT:    s_endpgm
1184;
1185; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32:
1186; GCN-NOHSA-VI:       ; %bb.0:
1187; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1188; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1189; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1190; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1191; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1192; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1193; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1194; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1195; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1196; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1197; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1198; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1199; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1200; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1201; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1202; GCN-NOHSA-VI-NEXT:    s_endpgm
1203;
1204; EG-LABEL: global_zextload_v2i16_to_v2i32:
1205; EG:       ; %bb.0:
1206; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1207; EG-NEXT:    TEX 0 @6
1208; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
1209; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
1210; EG-NEXT:    CF_END
1211; EG-NEXT:    PAD
1212; EG-NEXT:    Fetch clause starting at 6:
1213; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
1214; EG-NEXT:    ALU clause starting at 8:
1215; EG-NEXT:     MOV * T4.X, KC0[2].Z,
1216; EG-NEXT:    ALU clause starting at 9:
1217; EG-NEXT:     LSHR * T4.Y, T4.X, literal.x,
1218; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1219; EG-NEXT:     AND_INT T4.X, T4.X, literal.x,
1220; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
1221; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
1222;
1223; CM-LABEL: global_zextload_v2i16_to_v2i32:
1224; CM:       ; %bb.0:
1225; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1226; CM-NEXT:    TEX 0 @6
1227; CM-NEXT:    ALU 5, @9, KC0[CB0:0-32], KC1[]
1228; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
1229; CM-NEXT:    CF_END
1230; CM-NEXT:    PAD
1231; CM-NEXT:    Fetch clause starting at 6:
1232; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
1233; CM-NEXT:    ALU clause starting at 8:
1234; CM-NEXT:     MOV * T4.X, KC0[2].Z,
1235; CM-NEXT:    ALU clause starting at 9:
1236; CM-NEXT:     LSHR * T4.Y, T4.X, literal.x,
1237; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1238; CM-NEXT:     AND_INT * T4.X, T4.X, literal.x,
1239; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1240; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
1241; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1242  %load = load <2 x i16>, ptr addrspace(1) %in
1243  %ext = zext <2 x i16> %load to <2 x i32>
1244  store <2 x i32> %ext, ptr addrspace(1) %out
1245  ret void
1246}
1247
1248; TODO: On the r600 targets (EG, CM) the high element should use ASHR instead of LSHR + BFE_INT.
; The GCN checks below already expect v_ashrrev_i32 for the high element.
;
; Two-element sign-extending load: loads <2 x i16> from %in, sign-extends it
; to <2 x i32>, and stores the result to %out. The checks expect both
; elements to be fetched with one 32-bit load and then split into a low and a
; high sign-extended element.
1249define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1250; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32:
1251; GCN-NOHSA-SI:       ; %bb.0:
1252; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1253; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1254; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1255; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1256; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1257; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1258; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1259; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1260; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1261; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1262; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1263; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1264; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
1265; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
1266; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1267; GCN-NOHSA-SI-NEXT:    s_endpgm
1268;
1269; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
1270; GCN-HSA:       ; %bb.0:
1271; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1272; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1273; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1274; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1275; GCN-HSA-NEXT:    flat_load_dword v2, v[0:1]
1276; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
1277; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
1278; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1279; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
1280; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
1281; GCN-HSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1282; GCN-HSA-NEXT:    s_endpgm
1283;
1284; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32:
1285; GCN-NOHSA-VI:       ; %bb.0:
1286; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1287; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1288; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1289; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1290; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1291; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1292; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1293; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1294; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1295; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1296; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1297; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1298; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
1299; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
1300; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1301; GCN-NOHSA-VI-NEXT:    s_endpgm
1302;
1303; EG-LABEL: global_sextload_v2i16_to_v2i32:
1304; EG:       ; %bb.0:
1305; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1306; EG-NEXT:    TEX 0 @6
1307; EG-NEXT:    ALU 5, @9, KC0[CB0:0-32], KC1[]
1308; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
1309; EG-NEXT:    CF_END
1310; EG-NEXT:    PAD
1311; EG-NEXT:    Fetch clause starting at 6:
1312; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
1313; EG-NEXT:    ALU clause starting at 8:
1314; EG-NEXT:     MOV * T4.X, KC0[2].Z,
1315; EG-NEXT:    ALU clause starting at 9:
1316; EG-NEXT:     BFE_INT T5.X, T4.X, 0.0, literal.x,
1317; EG-NEXT:     LSHR T0.W, T4.X, literal.x,
1318; EG-NEXT:     LSHR * T4.X, KC0[2].Y, literal.y,
1319; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
1320; EG-NEXT:     BFE_INT * T5.Y, PV.W, 0.0, literal.x,
1321; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1322;
1323; CM-LABEL: global_sextload_v2i16_to_v2i32:
1324; CM:       ; %bb.0:
1325; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1326; CM-NEXT:    TEX 0 @6
1327; CM-NEXT:    ALU 5, @9, KC0[CB0:0-32], KC1[]
1328; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T4.X
1329; CM-NEXT:    CF_END
1330; CM-NEXT:    PAD
1331; CM-NEXT:    Fetch clause starting at 6:
1332; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
1333; CM-NEXT:    ALU clause starting at 8:
1334; CM-NEXT:     MOV * T4.X, KC0[2].Z,
1335; CM-NEXT:    ALU clause starting at 9:
1336; CM-NEXT:     BFE_INT T5.X, T4.X, 0.0, literal.x,
1337; CM-NEXT:     LSHR * T0.W, T4.X, literal.x,
1338; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1339; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
1340; CM-NEXT:     BFE_INT * T5.Y, PV.W, 0.0, literal.y,
1341; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1342  %load = load <2 x i16>, ptr addrspace(1) %in
1343  %ext = sext <2 x i16> %load to <2 x i32>
1344  store <2 x i32> %ext, ptr addrspace(1) %out
1345  ret void
1346}
1347
; Three-element zero-extending load: loads <3 x i16> from %in, zero-extends
; it to <3 x i32>, and stores the result to %out. The GCN checks expect the
; source to be fetched with one dwordx2 load. The SI checks then store the
; result as a dwordx2 plus a separate dword at offset 8, while the HSA and VI
; checks use a single dwordx3 store. The EG/CM checks expect three separate
; VTX_READ_16 fetches and two stores.
1348define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1349; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32:
1350; GCN-NOHSA-SI:       ; %bb.0: ; %entry
1351; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1352; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1353; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1354; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1355; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1356; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1357; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1358; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1359; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1360; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1361; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1362; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1363; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1364; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1365; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
1366; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
1367; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
1368; GCN-NOHSA-SI-NEXT:    s_endpgm
1369;
1370; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
1371; GCN-HSA:       ; %bb.0: ; %entry
1372; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1373; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1374; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1375; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1376; GCN-HSA-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
1377; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s0
1378; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s1
1379; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1380; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
1381; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v4
1382; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v3
1383; GCN-HSA-NEXT:    flat_store_dwordx3 v[5:6], v[0:2]
1384; GCN-HSA-NEXT:    s_endpgm
1385;
1386; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32:
1387; GCN-NOHSA-VI:       ; %bb.0: ; %entry
1388; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1389; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1390; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1391; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1392; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1393; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1394; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1395; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1396; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1397; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1398; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1399; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1400; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1401; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1402; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1403; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
1404; GCN-NOHSA-VI-NEXT:    s_endpgm
1405;
1406; EG-LABEL: global_zextload_v3i16_to_v3i32:
1407; EG:       ; %bb.0: ; %entry
1408; EG-NEXT:    ALU 4, @12, KC0[CB0:0-32], KC1[]
1409; EG-NEXT:    TEX 2 @6
1410; EG-NEXT:    ALU 2, @17, KC0[], KC1[]
1411; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
1412; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XY, T0.X, 1
1413; EG-NEXT:    CF_END
1414; EG-NEXT:    Fetch clause starting at 6:
1415; EG-NEXT:     VTX_READ_16 T2.X, T1.X, 4, #1
1416; EG-NEXT:     VTX_READ_16 T3.X, T1.X, 0, #1
1417; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 2, #1
1418; EG-NEXT:    ALU clause starting at 12:
1419; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
1420; EG-NEXT:     MOV * T1.X, KC0[2].Z,
1421; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1422; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1423; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1424; EG-NEXT:    ALU clause starting at 17:
1425; EG-NEXT:     LSHR T4.X, T0.W, literal.x,
1426; EG-NEXT:     MOV * T3.Y, T1.X,
1427; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1428;
1429; CM-LABEL: global_zextload_v3i16_to_v3i32:
1430; CM:       ; %bb.0: ; %entry
1431; CM-NEXT:    ALU 4, @12, KC0[CB0:0-32], KC1[]
1432; CM-NEXT:    TEX 2 @6
1433; CM-NEXT:    ALU 2, @17, KC0[CB0:0-32], KC1[]
1434; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3, T4.X
1435; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
1436; CM-NEXT:    CF_END
1437; CM-NEXT:    Fetch clause starting at 6:
1438; CM-NEXT:     VTX_READ_16 T2.X, T1.X, 4, #1
1439; CM-NEXT:     VTX_READ_16 T3.X, T1.X, 0, #1
1440; CM-NEXT:     VTX_READ_16 T1.X, T1.X, 2, #1
1441; CM-NEXT:    ALU clause starting at 12:
1442; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1443; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1444; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1445; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1446; CM-NEXT:     MOV * T1.X, KC0[2].Z,
1447; CM-NEXT:    ALU clause starting at 17:
1448; CM-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
1449; CM-NEXT:     MOV * T3.Y, T1.X,
1450; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1451entry:
1452  %ld = load <3 x i16>, ptr addrspace(1) %in
1453  %ext = zext <3 x i16> %ld to <3 x i32>
1454  store <3 x i32> %ext, ptr addrspace(1) %out
1455  ret void
1456}
1457
; Kernel under test: loads a <3 x i16> vector from global memory
; (addrspace 1) at %in, sign-extends every lane to i32 and stores the
; resulting <3 x i32> to %out.
; The per-target assertions inside the function are autogenerated (see the
; NOTE on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; In the expected output the GCN-NOHSA-SI run splits the store into a dwordx2
; plus a dword at offset 8, while the GCN-HSA and GCN-NOHSA-VI runs emit a
; single dwordx3 store.
1458define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1459; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32:
1460; GCN-NOHSA-SI:       ; %bb.0: ; %entry
1461; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1462; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1463; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1464; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1465; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1466; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1467; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1468; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1469; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1470; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1471; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1472; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1473; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
1474; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
1475; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v1, 0, 16
1476; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
1477; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
1478; GCN-NOHSA-SI-NEXT:    s_endpgm
1479;
1480; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
1481; GCN-HSA:       ; %bb.0: ; %entry
1482; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1483; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1484; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1485; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1486; GCN-HSA-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
1487; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s0
1488; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s1
1489; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1490; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
1491; GCN-HSA-NEXT:    v_bfe_i32 v2, v4, 0, 16
1492; GCN-HSA-NEXT:    v_bfe_i32 v0, v3, 0, 16
1493; GCN-HSA-NEXT:    flat_store_dwordx3 v[5:6], v[0:2]
1494; GCN-HSA-NEXT:    s_endpgm
1495;
1496; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32:
1497; GCN-NOHSA-VI:       ; %bb.0: ; %entry
1498; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1499; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1500; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1501; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1502; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1503; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1504; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1505; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1506; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[8:11], 0
1507; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1508; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1509; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1510; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
1511; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v4, 0, 16
1512; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v3, 0, 16
1513; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
1514; GCN-NOHSA-VI-NEXT:    s_endpgm
1515;
1516; EG-LABEL: global_sextload_v3i16_to_v3i32:
1517; EG:       ; %bb.0: ; %entry
1518; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
1519; EG-NEXT:    TEX 2 @6
1520; EG-NEXT:    ALU 9, @13, KC0[CB0:0-32], KC1[]
1521; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
1522; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1523; EG-NEXT:    CF_END
1524; EG-NEXT:    Fetch clause starting at 6:
1525; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1526; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 4, #1
1527; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1528; EG-NEXT:    ALU clause starting at 12:
1529; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1530; EG-NEXT:    ALU clause starting at 13:
1531; EG-NEXT:     BFE_INT * T0.Y, T1.X, 0.0, literal.x,
1532; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1533; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
1534; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1535; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
1536; EG-NEXT:     BFE_INT T2.X, T2.X, 0.0, literal.x,
1537; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
1538; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
1539; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
1540; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1541;
1542; CM-LABEL: global_sextload_v3i16_to_v3i32:
1543; CM:       ; %bb.0: ; %entry
1544; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
1545; CM-NEXT:    TEX 2 @6
1546; CM-NEXT:    ALU 9, @13, KC0[CB0:0-32], KC1[]
1547; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
1548; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T3.X
1549; CM-NEXT:    CF_END
1550; CM-NEXT:    Fetch clause starting at 6:
1551; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 4, #1
1552; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1553; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
1554; CM-NEXT:    ALU clause starting at 12:
1555; CM-NEXT:     MOV * T0.X, KC0[2].Z,
1556; CM-NEXT:    ALU clause starting at 13:
1557; CM-NEXT:     BFE_INT T1.X, T1.X, 0.0, literal.x,
1558; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
1559; CM-NEXT:    16(2.242078e-44), 8(1.121039e-44)
1560; CM-NEXT:     LSHR T3.X, PV.W, literal.x,
1561; CM-NEXT:     BFE_INT * T0.Y, T0.X, 0.0, literal.y,
1562; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1563; CM-NEXT:     BFE_INT * T0.X, T2.X, 0.0, literal.x,
1564; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1565; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
1566; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1567entry:
; The block label "entry" is echoed in the "%bb.0" assertions above, so it
; must keep this name. Body: load three i16 lanes, sign-extend, store.
1568  %ld = load <3 x i16>, ptr addrspace(1) %in
1569  %ext = sext <3 x i16> %ld to <3 x i32>
1570  store <3 x i32> %ext, ptr addrspace(1) %out
1571  ret void
1572}
1573
1574; TODO: This should use DST, but for some reason there are redundant MOVs
; NOTE(review): "DST" is not defined in the visible part of this file --
; confirm what it refers to before acting on the TODO above.
;
; Kernel under test: loads a <4 x i16> vector from global memory
; (addrspace 1) at %in, zero-extends every lane to i32 and stores the
; resulting <4 x i32> to %out.
; The assertions inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; In the three GCN expectations the vector is fetched with one dwordx2 load
; and written with one dwordx4 store; the odd lanes come from a 16-bit right
; shift and the even lanes from an AND with 0xffff.
; Attribute group #0 is declared outside this part of the file.
1575define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1576; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32:
1577; GCN-NOHSA-SI:       ; %bb.0:
1578; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1579; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1580; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1581; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1582; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1583; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1584; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1585; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1586; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1587; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1588; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1589; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1590; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
1591; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
1592; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
1593; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
1594; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1595; GCN-NOHSA-SI-NEXT:    s_endpgm
1596;
1597; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
1598; GCN-HSA:       ; %bb.0:
1599; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1600; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1601; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1602; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1603; GCN-HSA-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
1604; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s0
1605; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s1
1606; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1607; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
1608; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
1609; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v5
1610; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v4
1611; GCN-HSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
1612; GCN-HSA-NEXT:    s_endpgm
1613;
1614; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32:
1615; GCN-NOHSA-VI:       ; %bb.0:
1616; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1617; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1618; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1619; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1620; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1621; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1622; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1623; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1624; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1625; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1626; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1627; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1628; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1629; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
1630; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1631; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1632; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1633; GCN-NOHSA-VI-NEXT:    s_endpgm
1634;
1635; EG-LABEL: global_zextload_v4i16_to_v4i32:
1636; EG:       ; %bb.0:
1637; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1638; EG-NEXT:    TEX 0 @6
1639; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
1640; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
1641; EG-NEXT:    CF_END
1642; EG-NEXT:    PAD
1643; EG-NEXT:    Fetch clause starting at 6:
1644; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
1645; EG-NEXT:    ALU clause starting at 8:
1646; EG-NEXT:     MOV * T5.X, KC0[2].Z,
1647; EG-NEXT:    ALU clause starting at 9:
1648; EG-NEXT:     LSHR * T5.W, T5.Y, literal.x,
1649; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1650; EG-NEXT:     AND_INT * T5.Z, T5.Y, literal.x,
1651; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1652; EG-NEXT:     LSHR * T5.Y, T5.X, literal.x,
1653; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1654; EG-NEXT:     AND_INT T5.X, T5.X, literal.x,
1655; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.y,
1656; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
1657;
1658; CM-LABEL: global_zextload_v4i16_to_v4i32:
1659; CM:       ; %bb.0:
1660; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1661; CM-NEXT:    TEX 0 @6
1662; CM-NEXT:    ALU 9, @9, KC0[CB0:0-32], KC1[]
1663; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
1664; CM-NEXT:    CF_END
1665; CM-NEXT:    PAD
1666; CM-NEXT:    Fetch clause starting at 6:
1667; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
1668; CM-NEXT:    ALU clause starting at 8:
1669; CM-NEXT:     MOV * T5.X, KC0[2].Z,
1670; CM-NEXT:    ALU clause starting at 9:
1671; CM-NEXT:     LSHR * T5.W, T5.Y, literal.x,
1672; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1673; CM-NEXT:     AND_INT * T5.Z, T5.Y, literal.x,
1674; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1675; CM-NEXT:     LSHR * T5.Y, T5.X, literal.x,
1676; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1677; CM-NEXT:     AND_INT * T5.X, T5.X, literal.x,
1678; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1679; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
1680; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Body: load four i16 lanes, zero-extend to i32, store the widened vector.
1681  %load = load <4 x i16>, ptr addrspace(1) %in
1682  %ext = zext <4 x i16> %load to <4 x i32>
1683  store <4 x i32> %ext, ptr addrspace(1) %out
1684  ret void
1685}
1686
1687; TODO: We should use ASHR instead of LSHR + BFE
1688; TODO: This should use DST, but for some reason there are redundant MOVs
;
; Kernel under test: loads a <4 x i16> vector from global memory
; (addrspace 1) at %in, sign-extends every lane to i32 and stores the
; resulting <4 x i32> to %out.
; The assertions inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; The first TODO corresponds to the EG and CM expectations below, where each
; odd lane is produced by an LSHR by 16 followed by a BFE_INT.
; On the GCN side, the SI and HSA expectations compute the last lane with a
; 64-bit v_ashr_i64 by 48 plus a v_mov copy, whereas the VI expectation uses
; v_ashrrev_i32 for both odd lanes.
; Attribute group #0 is declared outside this part of the file.
1689define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1690; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32:
1691; GCN-NOHSA-SI:       ; %bb.0:
1692; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1693; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1694; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1695; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1696; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1697; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1698; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1699; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1700; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[8:11], 0
1701; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1702; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1703; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1704; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
1705; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[5:6], v[3:4], 48
1706; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v4, 0, 16
1707; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
1708; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v5
1709; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1710; GCN-NOHSA-SI-NEXT:    s_endpgm
1711;
1712; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
1713; GCN-HSA:       ; %bb.0:
1714; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1715; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1716; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1717; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1718; GCN-HSA-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
1719; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s0
1720; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s1
1721; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1722; GCN-HSA-NEXT:    v_ashr_i64 v[7:8], v[3:4], 48
1723; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
1724; GCN-HSA-NEXT:    v_bfe_i32 v2, v4, 0, 16
1725; GCN-HSA-NEXT:    v_bfe_i32 v0, v3, 0, 16
1726; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v7
1727; GCN-HSA-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
1728; GCN-HSA-NEXT:    s_endpgm
1729;
1730; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32:
1731; GCN-NOHSA-VI:       ; %bb.0:
1732; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1733; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1734; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1735; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1736; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1737; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1738; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1739; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1740; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[4:5], off, s[8:11], 0
1741; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1742; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1743; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1744; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
1745; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
1746; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v5, 0, 16
1747; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v4, 0, 16
1748; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1749; GCN-NOHSA-VI-NEXT:    s_endpgm
1750;
1751; EG-LABEL: global_sextload_v4i16_to_v4i32:
1752; EG:       ; %bb.0:
1753; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1754; EG-NEXT:    TEX 0 @6
1755; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
1756; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
1757; EG-NEXT:    CF_END
1758; EG-NEXT:    PAD
1759; EG-NEXT:    Fetch clause starting at 6:
1760; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
1761; EG-NEXT:    ALU clause starting at 8:
1762; EG-NEXT:     MOV * T5.X, KC0[2].Z,
1763; EG-NEXT:    ALU clause starting at 9:
1764; EG-NEXT:     BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
1765; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1766; EG-NEXT:     BFE_INT T6.X, T5.X, 0.0, literal.x,
1767; EG-NEXT:     LSHR * T0.W, T5.Y, literal.x,
1768; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1769; EG-NEXT:     BFE_INT T6.W, PV.W, 0.0, literal.x,
1770; EG-NEXT:     LSHR * T0.W, T5.X, literal.x,
1771; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1772; EG-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
1773; EG-NEXT:     BFE_INT * T6.Y, PS, 0.0, literal.y,
1774; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1775;
1776; CM-LABEL: global_sextload_v4i16_to_v4i32:
1777; CM:       ; %bb.0:
1778; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1779; CM-NEXT:    TEX 0 @6
1780; CM-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
1781; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T5.X
1782; CM-NEXT:    CF_END
1783; CM-NEXT:    PAD
1784; CM-NEXT:    Fetch clause starting at 6:
1785; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
1786; CM-NEXT:    ALU clause starting at 8:
1787; CM-NEXT:     MOV * T5.X, KC0[2].Z,
1788; CM-NEXT:    ALU clause starting at 9:
1789; CM-NEXT:     BFE_INT * T6.Z, T5.Y, 0.0, literal.x,
1790; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1791; CM-NEXT:     BFE_INT T6.X, T5.X, 0.0, literal.x,
1792; CM-NEXT:     LSHR * T0.W, T5.Y, literal.x,
1793; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1794; CM-NEXT:     LSHR T0.Z, T5.X, literal.x,
1795; CM-NEXT:     BFE_INT * T6.W, PV.W, 0.0, literal.x,
1796; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1797; CM-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
1798; CM-NEXT:     BFE_INT * T6.Y, PV.Z, 0.0, literal.y,
1799; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
; Body: load four i16 lanes, sign-extend to i32, store the widened vector.
1800  %load = load <4 x i16>, ptr addrspace(1) %in
1801  %ext = sext <4 x i16> %load to <4 x i32>
1802  store <4 x i32> %ext, ptr addrspace(1) %out
1803  ret void
1804}
1805
1806; TODO: These should use LSHR instead of BFE_UINT
; NOTE(review): the EG and CM expectations below already consist of LSHR and
; AND_INT only and contain no BFE_UINT, so the TODO above may be stale --
; confirm and drop it if so.
;
; Kernel under test: loads an <8 x i16> vector from global memory
; (addrspace 1) at %in, zero-extends every lane to i32 and stores the
; resulting <8 x i32> to %out.
; The assertions inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; In the three GCN expectations the input is fetched with one dwordx4 load and
; the result is written with two dwordx4 stores, at byte offsets 0 and 16; the
; HSA expectation forms the +16 address with s_add_u32 / s_addc_u32.
; Attribute group #0 is declared outside this part of the file.
1807define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1808; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32:
1809; GCN-NOHSA-SI:       ; %bb.0:
1810; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1811; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1812; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1813; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1814; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1815; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1816; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1817; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1818; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1819; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1820; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1821; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1822; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1823; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1824; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
1825; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
1826; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
1827; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
1828; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
1829; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
1830; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
1831; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1832; GCN-NOHSA-SI-NEXT:    s_endpgm
1833;
1834; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
1835; GCN-HSA:       ; %bb.0:
1836; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1837; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1838; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1839; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1840; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1841; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
1842; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
1843; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
1844; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
1845; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
1846; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
1847; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1848; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
1849; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
1850; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v3
1851; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
1852; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1853; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1854; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v1
1855; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
1856; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1857; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1858; GCN-HSA-NEXT:    s_endpgm
1859;
1860; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32:
1861; GCN-NOHSA-VI:       ; %bb.0:
1862; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1863; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
1864; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
1865; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
1866; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
1867; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
1868; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
1869; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
1870; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1871; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
1872; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
1873; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
1874; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
1875; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
1876; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
1877; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
1878; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1879; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
1880; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1881; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
1882; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
1883; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1884; GCN-NOHSA-VI-NEXT:    s_endpgm
1885;
1886; EG-LABEL: global_zextload_v8i16_to_v8i32:
1887; EG:       ; %bb.0:
1888; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1889; EG-NEXT:    TEX 0 @6
1890; EG-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
1891; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
1892; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
1893; EG-NEXT:    CF_END
1894; EG-NEXT:    Fetch clause starting at 6:
1895; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
1896; EG-NEXT:    ALU clause starting at 8:
1897; EG-NEXT:     MOV * T7.X, KC0[2].Z,
1898; EG-NEXT:    ALU clause starting at 9:
1899; EG-NEXT:     LSHR * T8.W, T7.Y, literal.x,
1900; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1901; EG-NEXT:     AND_INT * T8.Z, T7.Y, literal.x,
1902; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1903; EG-NEXT:     LSHR T8.Y, T7.X, literal.x,
1904; EG-NEXT:     LSHR * T9.W, T7.W, literal.x,
1905; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1906; EG-NEXT:     AND_INT T8.X, T7.X, literal.x,
1907; EG-NEXT:     AND_INT T9.Z, T7.W, literal.x,
1908; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.y,
1909; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
1910; EG-NEXT:     LSHR * T9.Y, T7.Z, literal.x,
1911; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1912; EG-NEXT:     AND_INT T9.X, T7.Z, literal.x,
1913; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
1914; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
1915; EG-NEXT:     LSHR * T10.X, PV.W, literal.x,
1916; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1917;
1918; CM-LABEL: global_zextload_v8i16_to_v8i32:
1919; CM:       ; %bb.0:
1920; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1921; CM-NEXT:    TEX 0 @6
1922; CM-NEXT:    ALU 17, @9, KC0[CB0:0-32], KC1[]
1923; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T10.X
1924; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T9.X
1925; CM-NEXT:    CF_END
1926; CM-NEXT:    Fetch clause starting at 6:
1927; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
1928; CM-NEXT:    ALU clause starting at 8:
1929; CM-NEXT:     MOV * T7.X, KC0[2].Z,
1930; CM-NEXT:    ALU clause starting at 9:
1931; CM-NEXT:     LSHR * T8.W, T7.W, literal.x,
1932; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1933; CM-NEXT:     AND_INT * T8.Z, T7.W, literal.x,
1934; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1935; CM-NEXT:     LSHR T8.Y, T7.Z, literal.x,
1936; CM-NEXT:     LSHR * T7.W, T7.Y, literal.x,
1937; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1938; CM-NEXT:     AND_INT T8.X, T7.Z, literal.x,
1939; CM-NEXT:     AND_INT T7.Z, T7.Y, literal.x,
1940; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
1941; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
1942; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
1943; CM-NEXT:     LSHR * T7.Y, T7.X, literal.y,
1944; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1945; CM-NEXT:     AND_INT * T7.X, T7.X, literal.x,
1946; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1947; CM-NEXT:     LSHR * T10.X, KC0[2].Y, literal.x,
1948; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Body: load eight i16 lanes, zero-extend to i32, store the widened vector.
1949  %load = load <8 x i16>, ptr addrspace(1) %in
1950  %ext = zext <8 x i16> %load to <8 x i32>
1951  store <8 x i32> %ext, ptr addrspace(1) %out
1952  ret void
1953}
1954
1955; TODO: These should use ASHR instead of LSHR + BFE_INT
;
; Kernel under test: loads an <8 x i16> vector from global memory
; (addrspace 1) at %in, sign-extends every lane to i32 and stores the
; resulting <8 x i32> to %out.
; The assertions inside the function are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; The TODO above corresponds to the EG and CM expectations below, where each
; odd lane is produced by an LSHR by 16 followed by a BFE_INT. The three GCN
; expectations use v_ashrrev_i32 for odd lanes and v_bfe_i32 for even lanes,
; with one dwordx4 load and two dwordx4 stores (byte offsets 0 and 16).
; Attribute group #0 is declared outside this part of the file.
1956define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1957; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32:
1958; GCN-NOHSA-SI:       ; %bb.0:
1959; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1960; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
1961; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
1962; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
1963; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
1964; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
1965; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
1966; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
1967; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1968; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
1969; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
1970; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
1971; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
1972; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
1973; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
1974; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
1975; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
1976; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
1977; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v3, 0, 16
1978; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
1979; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
1980; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
1981; GCN-NOHSA-SI-NEXT:    s_endpgm
1982;
1983; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
1984; GCN-HSA:       ; %bb.0:
1985; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1986; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
1987; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
1988; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
1989; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1990; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
1991; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
1992; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
1993; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
1994; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
1995; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
1996; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
1997; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
1998; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
1999; GCN-HSA-NEXT:    v_bfe_i32 v10, v3, 0, 16
2000; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
2001; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
2002; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
2003; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
2004; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
2005; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
2006; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
2007; GCN-HSA-NEXT:    s_endpgm
2008;
2009; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32:
2010; GCN-NOHSA-VI:       ; %bb.0:
2011; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2012; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
2013; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
2014; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
2015; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
2016; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
2017; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
2018; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
2019; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2020; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
2021; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
2022; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
2023; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
2024; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
2025; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v3, 0, 16
2026; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
2027; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
2028; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
2029; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v1, 0, 16
2030; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v0, 0, 16
2031; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
2032; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
2033; GCN-NOHSA-VI-NEXT:    s_endpgm
2034;
2035; EG-LABEL: global_sextload_v8i16_to_v8i32:
2036; EG:       ; %bb.0:
2037; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2038; EG-NEXT:    TEX 0 @6
2039; EG-NEXT:    ALU 19, @9, KC0[CB0:0-32], KC1[]
2040; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
2041; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
2042; EG-NEXT:    CF_END
2043; EG-NEXT:    Fetch clause starting at 6:
2044; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
2045; EG-NEXT:    ALU clause starting at 8:
2046; EG-NEXT:     MOV * T7.X, KC0[2].Z,
2047; EG-NEXT:    ALU clause starting at 9:
2048; EG-NEXT:     BFE_INT * T8.Z, T7.Y, 0.0, literal.x,
2049; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2050; EG-NEXT:     BFE_INT T8.X, T7.X, 0.0, literal.x,
2051; EG-NEXT:     BFE_INT T9.Z, T7.W, 0.0, literal.x,
2052; EG-NEXT:     LSHR * T0.W, T7.Y, literal.x,
2053; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2054; EG-NEXT:     BFE_INT T9.X, T7.Z, 0.0, literal.x,
2055; EG-NEXT:     LSHR T0.Z, T7.W, literal.x,
2056; EG-NEXT:     BFE_INT T8.W, PV.W, 0.0, literal.x,
2057; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
2058; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2059; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
2060; EG-NEXT:     BFE_INT T8.Y, PS, 0.0, literal.y,
2061; EG-NEXT:     LSHR T1.Z, T7.Z, literal.y,
2062; EG-NEXT:     BFE_INT T9.W, PV.Z, 0.0, literal.y,
2063; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2064; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2065; EG-NEXT:     LSHR T10.X, PS, literal.x,
2066; EG-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
2067; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2068;
2069; CM-LABEL: global_sextload_v8i16_to_v8i32:
2070; CM:       ; %bb.0:
2071; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2072; CM-NEXT:    TEX 0 @6
2073; CM-NEXT:    ALU 19, @9, KC0[CB0:0-32], KC1[]
2074; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T7.X
2075; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
2076; CM-NEXT:    CF_END
2077; CM-NEXT:    Fetch clause starting at 6:
2078; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
2079; CM-NEXT:    ALU clause starting at 8:
2080; CM-NEXT:     MOV * T7.X, KC0[2].Z,
2081; CM-NEXT:    ALU clause starting at 9:
2082; CM-NEXT:     BFE_INT * T8.Z, T7.W, 0.0, literal.x,
2083; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2084; CM-NEXT:     BFE_INT T8.X, T7.Z, 0.0, literal.x,
2085; CM-NEXT:     LSHR T0.Y, T7.Y, literal.x,
2086; CM-NEXT:     BFE_INT T9.Z, T7.Y, 0.0, literal.x,
2087; CM-NEXT:     LSHR * T0.W, T7.W, literal.x,
2088; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2089; CM-NEXT:     BFE_INT T9.X, T7.X, 0.0, literal.x,
2090; CM-NEXT:     LSHR T1.Y, T7.Z, literal.x,
2091; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
2092; CM-NEXT:     BFE_INT * T8.W, PV.W, 0.0, literal.x,
2093; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2094; CM-NEXT:     LSHR T10.X, PV.Z, literal.x,
2095; CM-NEXT:     BFE_INT T8.Y, PV.Y, 0.0, literal.y,
2096; CM-NEXT:     LSHR T0.Z, T7.X, literal.y,
2097; CM-NEXT:     BFE_INT * T9.W, T0.Y, 0.0, literal.y,
2098; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2099; CM-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
2100; CM-NEXT:     BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
2101; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
; Body: load eight i16 lanes, sign-extend to i32, store the widened vector.
2102  %load = load <8 x i16>, ptr addrspace(1) %in
2103  %ext = sext <8 x i16> %load to <8 x i32>
2104  store <8 x i32> %ext, ptr addrspace(1) %out
2105  ret void
2106}
2107
; Zero-extending vector load test: reads a <16 x i16> from the addrspace(1)
; pointer %in, widens every lane to i32 with zext, and writes the <16 x i32>
; result to the addrspace(1) pointer %out.
;
; The assertion lines inside the body give the expected llc output for each
; check prefix used in this function (GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG
; and CM). The file header notes that the assertions are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them with that script instead
; of editing them by hand.
;
; In the GCN expectations each loaded dword is split with v_lshrrev_b32 by 16
; (high half) and v_and_b32 with 0xffff (low half); the EG and CM expectations
; perform the same split with LSHR and AND_INT.
2108define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2109; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32:
2110; GCN-NOHSA-SI:       ; %bb.0:
2111; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
2112; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
2113; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
2114; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
2115; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
2116; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
2117; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
2118; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
2119; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2120; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
2121; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
2122; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2123; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
2124; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
2125; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2126; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
2127; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
2128; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
2129; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
2130; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
2131; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v7
2132; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v6
2133; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v1
2134; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
2135; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
2136; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
2137; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v5
2138; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v4
2139; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v7
2140; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
2141; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
2142; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
2143; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
2144; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
2145; GCN-NOHSA-SI-NEXT:    s_endpgm
2146;
2147; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
2148; GCN-HSA:       ; %bb.0:
2149; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2150; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2151; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
2152; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
2153; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
2154; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
2155; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
2156; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
2157; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2158; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2159; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
2160; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2161; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
2162; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
2163; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
2164; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
2165; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2166; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
2167; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
2168; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
2169; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
2170; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
2171; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
2172; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
2173; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
2174; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
2175; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2176; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
2177; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
2178; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v1
2179; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v0
2180; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v3
2181; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v2
2182; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
2183; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
2184; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
2185; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v7
2186; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v6
2187; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
2188; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
2189; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
2190; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v4
2191; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
2192; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
2193; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[12:15]
2194; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
2195; GCN-HSA-NEXT:    s_endpgm
2196;
2197; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32:
2198; GCN-NOHSA-VI:       ; %bb.0:
2199; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2200; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
2201; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
2202; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
2203; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
2204; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
2205; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
2206; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
2207; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2208; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2209; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
2210; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
2211; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
2212; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
2213; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
2214; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
2215; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v7
2216; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
2217; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v6
2218; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v1
2219; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2220; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
2221; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
2222; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
2223; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
2224; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
2225; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
2226; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
2227; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
2228; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
2229; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
2230; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
2231; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
2232; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
2233; GCN-NOHSA-VI-NEXT:    s_endpgm
2234;
2235; EG-LABEL: global_zextload_v16i16_to_v16i32:
2236; EG:       ; %bb.0:
2237; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
2238; EG-NEXT:    TEX 1 @8
2239; EG-NEXT:    ALU 35, @13, KC0[CB0:0-32], KC1[]
2240; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
2241; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T11.X, 0
2242; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
2243; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T12.X, 1
2244; EG-NEXT:    CF_END
2245; EG-NEXT:    Fetch clause starting at 8:
2246; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
2247; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 16, #1
2248; EG-NEXT:    ALU clause starting at 12:
2249; EG-NEXT:     MOV * T11.X, KC0[2].Z,
2250; EG-NEXT:    ALU clause starting at 13:
2251; EG-NEXT:     LSHR * T13.W, T12.Y, literal.x,
2252; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2253; EG-NEXT:     AND_INT * T13.Z, T12.Y, literal.x,
2254; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2255; EG-NEXT:     LSHR T13.Y, T12.X, literal.x,
2256; EG-NEXT:     LSHR * T14.W, T12.W, literal.x,
2257; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2258; EG-NEXT:     AND_INT T13.X, T12.X, literal.x,
2259; EG-NEXT:     AND_INT T14.Z, T12.W, literal.x,
2260; EG-NEXT:     LSHR * T12.X, KC0[2].Y, literal.y,
2261; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
2262; EG-NEXT:     LSHR T14.Y, T12.Z, literal.x,
2263; EG-NEXT:     LSHR * T15.W, T11.Y, literal.x,
2264; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2265; EG-NEXT:     AND_INT T14.X, T12.Z, literal.x,
2266; EG-NEXT:     AND_INT T15.Z, T11.Y, literal.x,
2267; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2268; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2269; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
2270; EG-NEXT:     LSHR T15.Y, T11.X, literal.y,
2271; EG-NEXT:     LSHR T17.W, T11.W, literal.y,
2272; EG-NEXT:     AND_INT * T15.X, T11.X, literal.z,
2273; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2274; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2275; EG-NEXT:     AND_INT T17.Z, T11.W, literal.x,
2276; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2277; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
2278; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
2279; EG-NEXT:     LSHR T17.Y, T11.Z, literal.y,
2280; EG-NEXT:     AND_INT * T17.X, T11.Z, literal.z,
2281; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2282; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2283; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2284; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
2285; EG-NEXT:     LSHR * T18.X, PV.W, literal.x,
2286; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2287;
2288; CM-LABEL: global_zextload_v16i16_to_v16i32:
2289; CM:       ; %bb.0:
2290; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
2291; CM-NEXT:    TEX 1 @8
2292; CM-NEXT:    ALU 33, @13, KC0[CB0:0-32], KC1[]
2293; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T18.X
2294; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T17.X
2295; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T16.X
2296; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T14.X
2297; CM-NEXT:    CF_END
2298; CM-NEXT:    Fetch clause starting at 8:
2299; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
2300; CM-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
2301; CM-NEXT:    ALU clause starting at 12:
2302; CM-NEXT:     MOV * T11.X, KC0[2].Z,
2303; CM-NEXT:    ALU clause starting at 13:
2304; CM-NEXT:     LSHR * T13.W, T12.W, literal.x,
2305; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2306; CM-NEXT:     AND_INT * T13.Z, T12.W, literal.x,
2307; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2308; CM-NEXT:     LSHR T13.Y, T12.Z, literal.x,
2309; CM-NEXT:     LSHR * T12.W, T12.Y, literal.x,
2310; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2311; CM-NEXT:     AND_INT T13.X, T12.Z, literal.x,
2312; CM-NEXT:     AND_INT T12.Z, T12.Y, literal.x,
2313; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2314; CM-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
2315; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
2316; CM-NEXT:     LSHR T12.Y, T12.X, literal.y,
2317; CM-NEXT:     LSHR * T15.W, T11.W, literal.y,
2318; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2319; CM-NEXT:     AND_INT T12.X, T12.X, literal.x,
2320; CM-NEXT:     AND_INT T15.Z, T11.W, literal.x,
2321; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2322; CM-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
2323; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
2324; CM-NEXT:     LSHR T15.Y, T11.Z, literal.y,
2325; CM-NEXT:     LSHR * T11.W, T11.Y, literal.y,
2326; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2327; CM-NEXT:     AND_INT T15.X, T11.Z, literal.x,
2328; CM-NEXT:     AND_INT T11.Z, T11.Y, literal.x,
2329; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2330; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2331; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
2332; CM-NEXT:     LSHR * T11.Y, T11.X, literal.y,
2333; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2334; CM-NEXT:     AND_INT * T11.X, T11.X, literal.x,
2335; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2336; CM-NEXT:     LSHR * T18.X, KC0[2].Y, literal.x,
2337; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; One vector load of all sixteen i16 lanes from %in.
2338  %load = load <16 x i16>, ptr addrspace(1) %in
  ; Widen each lane to i32, filling the upper 16 bits with zeros.
2339  %ext = zext <16 x i16> %load to <16 x i32>
  ; Write the widened vector (64 bytes) to %out.
2340  store <16 x i32> %ext, ptr addrspace(1) %out
2341  ret void
2342}
2343
; Sign-extending vector load test: reads a <16 x i16> from the addrspace(1)
; pointer %in, widens every lane to i32 with sext, and writes the <16 x i32>
; result to the addrspace(1) pointer %out.
;
; The assertion lines inside the body give the expected llc output for each
; check prefix used in this function (GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG
; and CM). The file header notes that the assertions are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them with that script instead
; of editing them by hand.
;
; In the GCN expectations each loaded dword is split with v_ashrrev_i32 by 16
; (high half) and v_bfe_i32 with offset 0 and width 16 (low half); the EG and
; CM expectations use BFE_INT, preceded by an LSHR by 16 for the high half.
2344define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2345; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32:
2346; GCN-NOHSA-SI:       ; %bb.0:
2347; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
2348; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
2349; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
2350; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
2351; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
2352; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
2353; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
2354; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
2355; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2356; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
2357; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
2358; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2359; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
2360; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
2361; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
2362; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v1, 0, 16
2363; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v0, 0, 16
2364; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
2365; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
2366; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v3, 0, 16
2367; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v2, 0, 16
2368; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
2369; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
2370; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
2371; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v5, 0, 16
2372; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v4, 0, 16
2373; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v7
2374; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v6
2375; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v7, 0, 16
2376; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v6, 0, 16
2377; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
2378; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
2379; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
2380; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
2381; GCN-NOHSA-SI-NEXT:    s_endpgm
2382;
2383; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
2384; GCN-HSA:       ; %bb.0:
2385; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2386; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2387; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
2388; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
2389; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
2390; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
2391; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
2392; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2393; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
2394; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2395; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
2396; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2397; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
2398; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
2399; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
2400; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
2401; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2402; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
2403; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
2404; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
2405; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
2406; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
2407; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
2408; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s0
2409; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
2410; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
2411; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
2412; GCN-HSA-NEXT:    v_bfe_i32 v8, v0, 0, 16
2413; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
2414; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
2415; GCN-HSA-NEXT:    v_bfe_i32 v14, v3, 0, 16
2416; GCN-HSA-NEXT:    v_bfe_i32 v12, v2, 0, 16
2417; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
2418; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
2419; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
2420; GCN-HSA-NEXT:    s_waitcnt vmcnt(2)
2421; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
2422; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
2423; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
2424; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
2425; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
2426; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
2427; GCN-HSA-NEXT:    v_bfe_i32 v2, v5, 0, 16
2428; GCN-HSA-NEXT:    v_bfe_i32 v0, v4, 0, 16
2429; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[7:10]
2430; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
2431; GCN-HSA-NEXT:    s_endpgm
2432;
2433; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32:
2434; GCN-NOHSA-VI:       ; %bb.0:
2435; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2436; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
2437; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
2438; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
2439; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
2440; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
2441; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
2442; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
2443; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2444; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2445; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
2446; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
2447; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
2448; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
2449; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
2450; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v7
2451; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v6
2452; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v7, 0, 16
2453; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v6, 0, 16
2454; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
2455; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v1, 0, 16
2456; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v0, 0, 16
2457; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
2458; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
2459; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
2460; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v2, 0, 16
2461; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
2462; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
2463; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v5, 0, 16
2464; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v4, 0, 16
2465; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
2466; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
2467; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
2468; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
2469; GCN-NOHSA-VI-NEXT:    s_endpgm
2470;
2471; EG-LABEL: global_sextload_v16i16_to_v16i32:
2472; EG:       ; %bb.0:
2473; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
2474; EG-NEXT:    TEX 1 @8
2475; EG-NEXT:    ALU 39, @13, KC0[CB0:0-32], KC1[]
2476; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
2477; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
2478; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
2479; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1
2480; EG-NEXT:    CF_END
2481; EG-NEXT:    Fetch clause starting at 8:
2482; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
2483; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
2484; EG-NEXT:    ALU clause starting at 12:
2485; EG-NEXT:     MOV * T11.X, KC0[2].Z,
2486; EG-NEXT:    ALU clause starting at 13:
2487; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
2488; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2489; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2490; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
2491; EG-NEXT:     BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
2492; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2493; EG-NEXT:     BFE_INT T15.X, T11.X, 0.0, literal.x,
2494; EG-NEXT:     LSHR T0.Y, T12.W, literal.x,
2495; EG-NEXT:     BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
2496; EG-NEXT:     LSHR T0.W, T12.Y, literal.x,
2497; EG-NEXT:     LSHR * T1.W, T11.Y, literal.x,
2498; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2499; EG-NEXT:     BFE_INT T16.X, T11.Z, 0.0, literal.x,
2500; EG-NEXT:     LSHR T1.Y, T11.W, literal.x,
2501; EG-NEXT:     BFE_INT T17.Z, T12.Y, 0.0, literal.x,
2502; EG-NEXT:     BFE_INT T15.W, PS, 0.0, literal.x,
2503; EG-NEXT:     LSHR * T1.W, T11.X, literal.x,
2504; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2505; EG-NEXT:     BFE_INT T17.X, T12.X, 0.0, literal.x,
2506; EG-NEXT:     BFE_INT T15.Y, PS, 0.0, literal.x,
2507; EG-NEXT:     BFE_INT T18.Z, T12.W, 0.0, literal.x,
2508; EG-NEXT:     BFE_INT T16.W, PV.Y, 0.0, literal.x,
2509; EG-NEXT:     LSHR * T1.W, T11.Z, literal.x,
2510; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2511; EG-NEXT:     BFE_INT T18.X, T12.Z, 0.0, literal.x,
2512; EG-NEXT:     BFE_INT T16.Y, PS, 0.0, literal.x,
2513; EG-NEXT:     LSHR T0.Z, T12.X, literal.x,
2514; EG-NEXT:     BFE_INT T17.W, T0.W, 0.0, literal.x,
2515; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2516; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
2517; EG-NEXT:     LSHR T11.X, PS, literal.x,
2518; EG-NEXT:     BFE_INT T17.Y, PV.Z, 0.0, literal.y,
2519; EG-NEXT:     LSHR T0.Z, T12.Z, literal.y,
2520; EG-NEXT:     BFE_INT T18.W, T0.Y, 0.0, literal.y,
2521; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
2522; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2523; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
2524; EG-NEXT:     LSHR T12.X, PS, literal.x,
2525; EG-NEXT:     BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
2526; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2527;
2528; CM-LABEL: global_sextload_v16i16_to_v16i32:
2529; CM:       ; %bb.0:
2530; CM-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
2531; CM-NEXT:    TEX 1 @8
2532; CM-NEXT:    ALU 40, @13, KC0[CB0:0-32], KC1[]
2533; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T11.X
2534; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T18.X
2535; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T14.X
2536; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T13.X
2537; CM-NEXT:    CF_END
2538; CM-NEXT:    Fetch clause starting at 8:
2539; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
2540; CM-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
2541; CM-NEXT:    ALU clause starting at 12:
2542; CM-NEXT:     MOV * T11.X, KC0[2].Z,
2543; CM-NEXT:    ALU clause starting at 13:
2544; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2545; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
2546; CM-NEXT:     LSHR T13.X, PV.W, literal.x,
2547; CM-NEXT:     LSHR T0.Y, T11.Y, literal.y,
2548; CM-NEXT:     LSHR T0.Z, T11.Z, literal.y,
2549; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
2550; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2551; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2552; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
2553; CM-NEXT:     LSHR T1.Y, T11.W, literal.y,
2554; CM-NEXT:     BFE_INT T15.Z, T12.W, 0.0, literal.y, BS:VEC_120/SCL_212
2555; CM-NEXT:     LSHR * T0.W, T12.X, literal.y,
2556; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2557; CM-NEXT:     BFE_INT T15.X, T12.Z, 0.0, literal.x,
2558; CM-NEXT:     LSHR T2.Y, T12.Y, literal.x,
2559; CM-NEXT:     BFE_INT T16.Z, T12.Y, 0.0, literal.x,
2560; CM-NEXT:     LSHR * T1.W, T12.W, literal.x,
2561; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2562; CM-NEXT:     BFE_INT T16.X, T12.X, 0.0, literal.x,
2563; CM-NEXT:     LSHR T3.Y, T12.Z, literal.x,
2564; CM-NEXT:     BFE_INT T12.Z, T11.W, 0.0, literal.x,
2565; CM-NEXT:     BFE_INT * T15.W, PV.W, 0.0, literal.x,
2566; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2567; CM-NEXT:     BFE_INT T12.X, T11.Z, 0.0, literal.x,
2568; CM-NEXT:     BFE_INT T15.Y, PV.Y, 0.0, literal.x,
2569; CM-NEXT:     BFE_INT T17.Z, T11.Y, 0.0, literal.x,
2570; CM-NEXT:     BFE_INT * T16.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
2571; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2572; CM-NEXT:     BFE_INT T17.X, T11.X, 0.0, literal.x,
2573; CM-NEXT:     BFE_INT T16.Y, T0.W, 0.0, literal.x,
2574; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2575; CM-NEXT:     BFE_INT * T12.W, T1.Y, 0.0, literal.x,
2576; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2577; CM-NEXT:     LSHR T18.X, PV.Z, literal.x,
2578; CM-NEXT:     BFE_INT T12.Y, T0.Z, 0.0, literal.y,
2579; CM-NEXT:     LSHR T0.Z, T11.X, literal.y,
2580; CM-NEXT:     BFE_INT * T17.W, T0.Y, 0.0, literal.y,
2581; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2582; CM-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
2583; CM-NEXT:     BFE_INT * T17.Y, PV.Z, 0.0, literal.y,
2584; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
  ; One vector load of all sixteen i16 lanes from %in.
2585  %load = load <16 x i16>, ptr addrspace(1) %in
  ; Widen each lane to i32, replicating the i16 sign bit into the upper 16 bits.
2586  %ext = sext <16 x i16> %load to <16 x i32>
  ; Write the widened vector (64 bytes) to %out.
2587  store <16 x i32> %ext, ptr addrspace(1) %out
2588  ret void
2589}
2590
2591define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2592; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32:
2593; GCN-NOHSA-SI:       ; %bb.0:
2594; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
2595; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
2596; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
2597; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
2598; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
2599; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
2600; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
2601; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
2602; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2603; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2604; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
2605; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
2606; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
2607; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
2608; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
2609; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v1
2610; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v0
2611; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
2612; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
2613; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
2614; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v3
2615; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
2616; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v22, 0xffff, v1
2617; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v0
2618; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
2619; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
2620; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v26, 0xffff, v7
2621; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
2622; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
2623; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v4
2624; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
2625; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
2626; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
2627; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v9
2628; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v8
2629; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v11
2630; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v10
2631; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v9
2632; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v8
2633; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
2634; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
2635; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
2636; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
2637; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
2638; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v15
2639; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v14
2640; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v13
2641; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v32, 0xffff, v12
2642; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
2643; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
2644; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
2645; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
2646; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
2647; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
2648; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
2649; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48
2650; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0
2651; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
2652; GCN-NOHSA-SI-NEXT:    s_endpgm
2653;
2654; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
2655; GCN-HSA:       ; %bb.0:
2656; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2657; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
2658; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
2659; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
2660; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
2661; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
2662; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
2663; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
2664; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
2665; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
2666; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
2667; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
2668; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
2669; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
2670; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
2671; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
2672; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2673; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
2674; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
2675; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
2676; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
2677; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2678; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
2679; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
2680; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
2681; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2682; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
2683; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
2684; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
2685; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
2686; GCN-HSA-NEXT:    s_add_u32 s4, s0, 64
2687; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
2688; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0x50
2689; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
2690; GCN-HSA-NEXT:    s_add_u32 s8, s0, 32
2691; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
2692; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s9
2693; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s8
2694; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
2695; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
2696; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
2697; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
2698; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
2699; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
2700; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
2701; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
2702; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
2703; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
2704; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
2705; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
2706; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v4
2707; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
2708; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
2709; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
2710; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
2711; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v13
2712; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v12
2713; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v13
2714; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v12
2715; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s7
2716; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
2717; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v14
2718; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
2719; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
2720; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
2721; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v6
2722; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
2723; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v15
2724; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v14
2725; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s6
2726; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
2727; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
2728; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v23, 16, v3
2729; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
2730; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
2731; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v1
2732; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
2733; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
2734; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
2735; GCN-HSA-NEXT:    v_and_b32_e32 v22, 0xffff, v3
2736; GCN-HSA-NEXT:    v_and_b32_e32 v20, 0xffff, v2
2737; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
2738; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
2739; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
2740; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v9
2741; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
2742; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v11
2743; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v10
2744; GCN-HSA-NEXT:    v_and_b32_e32 v13, 0xffff, v9
2745; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v8
2746; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
2747; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[11:14]
2748; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[4:7]
2749; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
2750; GCN-HSA-NEXT:    s_endpgm
2751;
2752; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32:
2753; GCN-NOHSA-VI:       ; %bb.0:
2754; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
2755; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
2756; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
2757; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
2758; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
2759; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
2760; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
2761; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
2762; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2763; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
2764; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
2765; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
2766; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
2767; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
2768; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
2769; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
2770; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v3
2771; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v2
2772; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
2773; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
2774; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
2775; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v14
2776; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v14
2777; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
2778; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v13
2779; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
2780; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2781; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
2782; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2783; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
2784; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2785; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2786; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v7
2787; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v22, 0xffff, v7
2788; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v6
2789; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
2790; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
2791; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
2792; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2793; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
2794; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
2795; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v26, 0xffff, v11
2796; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
2797; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v10
2798; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
2799; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v9
2800; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
2801; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
2802; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96
2803; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
2804; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
2805; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
2806; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
2807; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
2808; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2809; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
2810; GCN-NOHSA-VI-NEXT:    s_endpgm
2811;
2812; EG-LABEL: global_zextload_v32i16_to_v32i32:
2813; EG:       ; %bb.0:
2814; EG-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
2815; EG-NEXT:    TEX 3 @12
2816; EG-NEXT:    ALU 72, @21, KC0[CB0:0-32], KC1[]
2817; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T34.X, 0
2818; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
2819; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
2820; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T30.X, 0
2821; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T29.X, 0
2822; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
2823; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T26.X, 0
2824; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T24.X, 1
2825; EG-NEXT:    CF_END
2826; EG-NEXT:    Fetch clause starting at 12:
2827; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
2828; EG-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 48, #1
2829; EG-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 32, #1
2830; EG-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 16, #1
2831; EG-NEXT:    ALU clause starting at 20:
2832; EG-NEXT:     MOV * T19.X, KC0[2].Z,
2833; EG-NEXT:    ALU clause starting at 21:
2834; EG-NEXT:     LSHR * T23.W, T20.W, literal.x,
2835; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2836; EG-NEXT:     AND_INT * T23.Z, T20.W, literal.x,
2837; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2838; EG-NEXT:     LSHR T23.Y, T20.Z, literal.x,
2839; EG-NEXT:     LSHR * T20.W, T20.Y, literal.x,
2840; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2841; EG-NEXT:     AND_INT T23.X, T20.Z, literal.x,
2842; EG-NEXT:     AND_INT T20.Z, T20.Y, literal.x,
2843; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2844; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2845; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
2846; EG-NEXT:     LSHR T20.Y, T20.X, literal.y,
2847; EG-NEXT:     LSHR T25.W, T19.W, literal.y,
2848; EG-NEXT:     AND_INT * T20.X, T20.X, literal.z,
2849; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2850; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2851; EG-NEXT:     AND_INT * T25.Z, T19.W, literal.x,
2852; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2853; EG-NEXT:     LSHR T26.X, KC0[2].Y, literal.x,
2854; EG-NEXT:     LSHR T25.Y, T19.Z, literal.y,
2855; EG-NEXT:     LSHR T19.W, T19.Y, literal.y,
2856; EG-NEXT:     AND_INT * T25.X, T19.Z, literal.z,
2857; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2858; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2859; EG-NEXT:     AND_INT T19.Z, T19.Y, literal.x,
2860; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2861; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
2862; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
2863; EG-NEXT:     LSHR T19.Y, T19.X, literal.y,
2864; EG-NEXT:     LSHR T28.W, T22.W, literal.y,
2865; EG-NEXT:     AND_INT * T19.X, T19.X, literal.z,
2866; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2867; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2868; EG-NEXT:     AND_INT T28.Z, T22.W, literal.x,
2869; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2870; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
2871; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
2872; EG-NEXT:     LSHR T28.Y, T22.Z, literal.y,
2873; EG-NEXT:     LSHR T22.W, T22.Y, literal.y,
2874; EG-NEXT:     AND_INT * T28.X, T22.Z, literal.z,
2875; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2876; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2877; EG-NEXT:     AND_INT T22.Z, T22.Y, literal.x,
2878; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2879; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
2880; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
2881; EG-NEXT:     LSHR T22.Y, T22.X, literal.y,
2882; EG-NEXT:     LSHR T31.W, T21.W, literal.y,
2883; EG-NEXT:     AND_INT * T22.X, T22.X, literal.z,
2884; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2885; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2886; EG-NEXT:     AND_INT T31.Z, T21.W, literal.x,
2887; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2888; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
2889; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
2890; EG-NEXT:     LSHR T31.Y, T21.Z, literal.y,
2891; EG-NEXT:     LSHR T21.W, T21.Y, literal.y,
2892; EG-NEXT:     AND_INT * T31.X, T21.Z, literal.z,
2893; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2894; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2895; EG-NEXT:     AND_INT T21.Z, T21.Y, literal.x,
2896; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2897; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
2898; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
2899; EG-NEXT:     LSHR T21.Y, T21.X, literal.y,
2900; EG-NEXT:     AND_INT * T21.X, T21.X, literal.z,
2901; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2902; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2903; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2904; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
2905; EG-NEXT:     LSHR * T34.X, PV.W, literal.x,
2906; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2907;
2908; CM-LABEL: global_zextload_v32i16_to_v32i32:
2909; CM:       ; %bb.0:
2910; CM-NEXT:    ALU 0, @20, KC0[CB0:0-32], KC1[]
2911; CM-NEXT:    TEX 3 @12
2912; CM-NEXT:    ALU 65, @21, KC0[CB0:0-32], KC1[]
2913; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
2914; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T21.X
2915; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T32.X
2916; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T22.X
2917; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T29.X
2918; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T19.X
2919; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T26.X
2920; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T20.X
2921; CM-NEXT:    CF_END
2922; CM-NEXT:    Fetch clause starting at 12:
2923; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 48, #1
2924; CM-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 0, #1
2925; CM-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 16, #1
2926; CM-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 32, #1
2927; CM-NEXT:    ALU clause starting at 20:
2928; CM-NEXT:     MOV * T19.X, KC0[2].Z,
2929; CM-NEXT:    ALU clause starting at 21:
2930; CM-NEXT:     LSHR * T23.W, T20.Y, literal.x,
2931; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2932; CM-NEXT:     AND_INT * T23.Z, T20.Y, literal.x,
2933; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2934; CM-NEXT:     LSHR T23.Y, T20.X, literal.x,
2935; CM-NEXT:     LSHR * T24.W, T20.W, literal.x,
2936; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2937; CM-NEXT:     AND_INT T23.X, T20.X, literal.x,
2938; CM-NEXT:     AND_INT T24.Z, T20.W, literal.x,
2939; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2940; CM-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
2941; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
2942; CM-NEXT:     LSHR T24.Y, T20.Z, literal.y,
2943; CM-NEXT:     LSHR * T25.W, T19.Y, literal.y,
2944; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2945; CM-NEXT:     AND_INT T24.X, T20.Z, literal.x,
2946; CM-NEXT:     AND_INT T25.Z, T19.Y, literal.x,
2947; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2948; CM-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
2949; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
2950; CM-NEXT:     LSHR T25.Y, T19.X, literal.y,
2951; CM-NEXT:     LSHR * T27.W, T19.W, literal.y,
2952; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2953; CM-NEXT:     AND_INT T25.X, T19.X, literal.x,
2954; CM-NEXT:     AND_INT T27.Z, T19.W, literal.x,
2955; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2956; CM-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
2957; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
2958; CM-NEXT:     LSHR T27.Y, T19.Z, literal.y,
2959; CM-NEXT:     LSHR * T28.W, T22.Y, literal.y,
2960; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2961; CM-NEXT:     AND_INT T27.X, T19.Z, literal.x,
2962; CM-NEXT:     AND_INT T28.Z, T22.Y, literal.x,
2963; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2964; CM-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
2965; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
2966; CM-NEXT:     LSHR T28.Y, T22.X, literal.y,
2967; CM-NEXT:     LSHR * T30.W, T22.W, literal.y,
2968; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2969; CM-NEXT:     AND_INT T28.X, T22.X, literal.x,
2970; CM-NEXT:     AND_INT T30.Z, T22.W, literal.x,
2971; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2972; CM-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
2973; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
2974; CM-NEXT:     LSHR T30.Y, T22.Z, literal.y,
2975; CM-NEXT:     LSHR * T31.W, T21.Y, literal.y,
2976; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2977; CM-NEXT:     AND_INT T30.X, T22.Z, literal.x,
2978; CM-NEXT:     AND_INT T31.Z, T21.Y, literal.x,
2979; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2980; CM-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
2981; CM-NEXT:     LSHR T32.X, PV.W, literal.x,
2982; CM-NEXT:     LSHR T31.Y, T21.X, literal.y,
2983; CM-NEXT:     LSHR * T33.W, T21.W, literal.y,
2984; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2985; CM-NEXT:     AND_INT T31.X, T21.X, literal.x,
2986; CM-NEXT:     AND_INT * T33.Z, T21.W, literal.x,
2987; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2988; CM-NEXT:     LSHR T21.X, KC0[2].Y, literal.x,
2989; CM-NEXT:     LSHR * T33.Y, T21.Z, literal.y,
2990; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2991; CM-NEXT:     AND_INT T33.X, T21.Z, literal.x,
2992; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
2993; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2994; CM-NEXT:     LSHR * T34.X, PV.W, literal.x,
2995; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2996  %load = load <32 x i16>, ptr addrspace(1) %in
2997  %ext = zext <32 x i16> %load to <32 x i32>
2998  store <32 x i32> %ext, ptr addrspace(1) %out
2999  ret void
3000}
3001
3002define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3003; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32:
3004; GCN-NOHSA-SI:       ; %bb.0:
3005; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
3006; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
3007; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
3008; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
3009; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
3010; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
3011; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
3012; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
3013; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3014; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
3015; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
3016; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
3017; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
3018; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
3019; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
3020; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v18, v3, 0, 16
3021; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v2, 0, 16
3022; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v1
3023; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v0
3024; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v1, 0, 16
3025; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v0, 0, 16
3026; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
3027; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
3028; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
3029; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v7, 0, 16
3030; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v6, 0, 16
3031; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v5
3032; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v4
3033; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v26, v5, 0, 16
3034; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v24, v4, 0, 16
3035; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
3036; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v11
3037; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v10
3038; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v11, 0, 16
3039; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v10, 0, 16
3040; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v9
3041; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v8
3042; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v30, v9, 0, 16
3043; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v28, v8, 0, 16
3044; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
3045; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v15
3046; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v14
3047; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v15, 0, 16
3048; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v14, 0, 16
3049; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
3050; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
3051; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v13, 0, 16
3052; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v12, 0, 16
3053; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
3054; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
3055; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
3056; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
3057; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
3058; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
3059; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
3060; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
3061; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0
3062; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
3063; GCN-NOHSA-SI-NEXT:    s_endpgm
3064;
3065; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
3066; GCN-HSA:       ; %bb.0:
3067; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3068; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
3069; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
3070; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
3071; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
3072; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
3073; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
3074; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
3075; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
3076; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
3077; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
3078; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
3079; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
3080; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
3081; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
3082; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
3083; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
3084; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
3085; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
3086; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3087; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
3088; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3089; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
3090; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
3091; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
3092; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3093; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
3094; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
3095; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
3096; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3097; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
3098; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
3099; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
3100; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
3101; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
3102; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3103; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
3104; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 16, v13
3105; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 16, v12
3106; GCN-HSA-NEXT:    v_bfe_i32 v18, v13, 0, 16
3107; GCN-HSA-NEXT:    v_bfe_i32 v16, v12, 0, 16
3108; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
3109; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s3
3110; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s2
3111; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
3112; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3113; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
3114; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
3115; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
3116; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
3117; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
3118; GCN-HSA-NEXT:    v_bfe_i32 v17, v15, 0, 16
3119; GCN-HSA-NEXT:    v_bfe_i32 v15, v14, 0, 16
3120; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3121; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[15:18]
3122; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
3123; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v11
3124; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v10
3125; GCN-HSA-NEXT:    v_bfe_i32 v13, v11, 0, 16
3126; GCN-HSA-NEXT:    v_bfe_i32 v11, v10, 0, 16
3127; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v9
3128; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v16, 16, v8
3129; GCN-HSA-NEXT:    v_bfe_i32 v17, v9, 0, 16
3130; GCN-HSA-NEXT:    v_bfe_i32 v15, v8, 0, 16
3131; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
3132; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
3133; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
3134; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[15:18]
3135; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[11:14]
3136; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
3137; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
3138; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v6
3139; GCN-HSA-NEXT:    v_bfe_i32 v9, v7, 0, 16
3140; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
3141; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v14, 16, v5
3142; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v12, 16, v4
3143; GCN-HSA-NEXT:    v_bfe_i32 v13, v5, 0, 16
3144; GCN-HSA-NEXT:    v_bfe_i32 v11, v4, 0, 16
3145; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
3146; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
3147; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[11:14]
3148; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[7:10]
3149; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
3150; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
3151; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 16, v1
3152; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 16, v0
3153; GCN-HSA-NEXT:    v_bfe_i32 v9, v1, 0, 16
3154; GCN-HSA-NEXT:    v_bfe_i32 v7, v0, 0, 16
3155; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 16, v3
3156; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 16, v2
3157; GCN-HSA-NEXT:    v_bfe_i32 v5, v3, 0, 16
3158; GCN-HSA-NEXT:    v_bfe_i32 v3, v2, 0, 16
3159; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[7:10]
3160; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[3:6]
3161; GCN-HSA-NEXT:    s_endpgm
3162;
3163; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32:
3164; GCN-NOHSA-VI:       ; %bb.0:
3165; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
3166; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
3167; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
3168; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
3169; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
3170; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
3171; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
3172; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
3173; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3174; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
3175; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
3176; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
3177; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
3178; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
3179; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
3180; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v3
3181; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v2
3182; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v3, 0, 16
3183; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
3184; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v35, 16, v13
3185; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v33, 16, v12
3186; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v34, v13, 0, 16
3187; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v32, v12, 0, 16
3188; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v2, 0, 16
3189; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v23, 16, v1
3190; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 16, v0
3191; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v22, v1, 0, 16
3192; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v0, 0, 16
3193; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
3194; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
3195; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v7, 0, 16
3196; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v6, 0, 16
3197; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v27, 16, v5
3198; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v25, 16, v4
3199; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v26, v5, 0, 16
3200; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v24, v4, 0, 16
3201; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v11
3202; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v10
3203; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v11, 0, 16
3204; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v10, 0, 16
3205; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v31, 16, v9
3206; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v29, 16, v8
3207; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v30, v9, 0, 16
3208; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v28, v8, 0, 16
3209; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v15
3210; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v14
3211; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v15, 0, 16
3212; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v14, 0, 16
3213; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
3214; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
3215; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
3216; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
3217; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
3218; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
3219; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0
3220; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
3221; GCN-NOHSA-VI-NEXT:    s_endpgm
3222;
3223; EG-LABEL: global_sextload_v32i16_to_v32i32:
3224; EG:       ; %bb.0:
3225; EG-NEXT:    ALU 9, @20, KC0[CB0:0-32], KC1[]
3226; EG-NEXT:    TEX 3 @12
3227; EG-NEXT:    ALU 73, @30, KC0[CB0:0-32], KC1[]
3228; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T22.X, 0
3229; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T34.X, 0
3230; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T28.X, 0
3231; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T27.X, 0
3232; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T26.X, 0
3233; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
3234; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T20.X, 0
3235; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 1
3236; EG-NEXT:    CF_END
3237; EG-NEXT:    Fetch clause starting at 12:
3238; EG-NEXT:     VTX_READ_128 T23.XYZW, T22.X, 16, #1
3239; EG-NEXT:     VTX_READ_128 T24.XYZW, T22.X, 32, #1
3240; EG-NEXT:     VTX_READ_128 T25.XYZW, T22.X, 0, #1
3241; EG-NEXT:     VTX_READ_128 T22.XYZW, T22.X, 48, #1
3242; EG-NEXT:    ALU clause starting at 20:
3243; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3244; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3245; EG-NEXT:     LSHR T19.X, PV.W, literal.x,
3246; EG-NEXT:     LSHR * T20.X, KC0[2].Y, literal.x,
3247; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3248; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3249; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
3250; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
3251; EG-NEXT:     MOV * T22.X, KC0[2].Z,
3252; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3253; EG-NEXT:    ALU clause starting at 30:
3254; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3255; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
3256; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
3257; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3258; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
3259; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
3260; EG-NEXT:     LSHR T0.W, T22.Y, literal.y,
3261; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
3262; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3263; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
3264; EG-NEXT:     LSHR T28.X, PS, literal.x,
3265; EG-NEXT:     LSHR T0.Y, T22.W, literal.y,
3266; EG-NEXT:     BFE_INT T29.Z, T25.W, 0.0, literal.y, BS:VEC_120/SCL_212
3267; EG-NEXT:     LSHR T1.W, T24.Y, literal.y,
3268; EG-NEXT:     LSHR * T2.W, T24.W, literal.y,
3269; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3270; EG-NEXT:     BFE_INT T29.X, T25.Z, 0.0, literal.x,
3271; EG-NEXT:     LSHR T1.Y, T23.Y, literal.x,
3272; EG-NEXT:     BFE_INT T30.Z, T25.Y, 0.0, literal.x, BS:VEC_120/SCL_212
3273; EG-NEXT:     LSHR T3.W, T23.W, literal.x,
3274; EG-NEXT:     LSHR * T4.W, T25.W, literal.x,
3275; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3276; EG-NEXT:     BFE_INT T30.X, T25.X, 0.0, literal.x,
3277; EG-NEXT:     LSHR T2.Y, T25.Y, literal.x,
3278; EG-NEXT:     BFE_INT T31.Z, T23.W, 0.0, literal.x,
3279; EG-NEXT:     BFE_INT T29.W, PS, 0.0, literal.x,
3280; EG-NEXT:     LSHR * T4.W, T25.Z, literal.x,
3281; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3282; EG-NEXT:     BFE_INT T31.X, T23.Z, 0.0, literal.x,
3283; EG-NEXT:     BFE_INT T29.Y, PS, 0.0, literal.x,
3284; EG-NEXT:     BFE_INT T25.Z, T23.Y, 0.0, literal.x,
3285; EG-NEXT:     BFE_INT T30.W, PV.Y, 0.0, literal.x,
3286; EG-NEXT:     LSHR * T4.W, T25.X, literal.x,
3287; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3288; EG-NEXT:     BFE_INT T25.X, T23.X, 0.0, literal.x,
3289; EG-NEXT:     BFE_INT T30.Y, PS, 0.0, literal.x,
3290; EG-NEXT:     BFE_INT T32.Z, T24.W, 0.0, literal.x,
3291; EG-NEXT:     BFE_INT T31.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
3292; EG-NEXT:     LSHR * T3.W, T23.Z, literal.x,
3293; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3294; EG-NEXT:     BFE_INT T32.X, T24.Z, 0.0, literal.x,
3295; EG-NEXT:     BFE_INT T31.Y, PS, 0.0, literal.x,
3296; EG-NEXT:     BFE_INT T23.Z, T24.Y, 0.0, literal.x,
3297; EG-NEXT:     BFE_INT T25.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
3298; EG-NEXT:     LSHR * T3.W, T23.X, literal.x,
3299; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3300; EG-NEXT:     BFE_INT T23.X, T24.X, 0.0, literal.x,
3301; EG-NEXT:     BFE_INT T25.Y, PS, 0.0, literal.x,
3302; EG-NEXT:     BFE_INT T33.Z, T22.W, 0.0, literal.x,
3303; EG-NEXT:     BFE_INT T32.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
3304; EG-NEXT:     LSHR * T2.W, T24.Z, literal.x,
3305; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3306; EG-NEXT:     BFE_INT T33.X, T22.Z, 0.0, literal.x,
3307; EG-NEXT:     BFE_INT T32.Y, PS, 0.0, literal.x,
3308; EG-NEXT:     BFE_INT T24.Z, T22.Y, 0.0, literal.x,
3309; EG-NEXT:     BFE_INT T23.W, T1.W, 0.0, literal.x,
3310; EG-NEXT:     LSHR * T1.W, T24.X, literal.x,
3311; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3312; EG-NEXT:     BFE_INT T24.X, T22.X, 0.0, literal.x,
3313; EG-NEXT:     BFE_INT T23.Y, PS, 0.0, literal.x,
3314; EG-NEXT:     LSHR T0.Z, T22.Z, literal.x,
3315; EG-NEXT:     BFE_INT T33.W, T0.Y, 0.0, literal.x,
3316; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
3317; EG-NEXT:    16(2.242078e-44), 112(1.569454e-43)
3318; EG-NEXT:     LSHR T34.X, PS, literal.x,
3319; EG-NEXT:     BFE_INT T33.Y, PV.Z, 0.0, literal.y,
3320; EG-NEXT:     LSHR T0.Z, T22.X, literal.y,
3321; EG-NEXT:     BFE_INT T24.W, T0.W, 0.0, literal.y,
3322; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
3323; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3324; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
3325; EG-NEXT:     LSHR T22.X, PS, literal.x,
3326; EG-NEXT:     BFE_INT * T24.Y, PV.Z, 0.0, literal.y,
3327; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3328;
3329; CM-LABEL: global_sextload_v32i16_to_v32i32:
3330; CM:       ; %bb.0:
3331; CM-NEXT:    ALU 0, @22, KC0[CB0:0-32], KC1[]
3332; CM-NEXT:    TEX 0 @14
3333; CM-NEXT:    ALU 7, @23, KC0[CB0:0-32], KC1[]
3334; CM-NEXT:    TEX 2 @16
3335; CM-NEXT:    ALU 76, @31, KC0[CB0:0-32], KC1[]
3336; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
3337; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T20.X
3338; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T28.X
3339; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T27.X
3340; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T26.X
3341; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
3342; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T24.X
3343; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T21.X
3344; CM-NEXT:    CF_END
3345; CM-NEXT:    Fetch clause starting at 14:
3346; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
3347; CM-NEXT:    Fetch clause starting at 16:
3348; CM-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 48, #1
3349; CM-NEXT:     VTX_READ_128 T23.XYZW, T19.X, 32, #1
3350; CM-NEXT:     VTX_READ_128 T19.XYZW, T19.X, 16, #1
3351; CM-NEXT:    ALU clause starting at 22:
3352; CM-NEXT:     MOV * T19.X, KC0[2].Z,
3353; CM-NEXT:    ALU clause starting at 23:
3354; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
3355; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
3356; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
3357; CM-NEXT:     LSHR T0.Y, T20.Z, literal.y,
3358; CM-NEXT:     LSHR T0.Z, T20.W, literal.y,
3359; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
3360; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3361; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
3362; CM-NEXT:    ALU clause starting at 31:
3363; CM-NEXT:     LSHR T24.X, T0.W, literal.x,
3364; CM-NEXT:     LSHR T1.Y, T20.Y, literal.y,
3365; CM-NEXT:     LSHR T1.Z, T19.Z, literal.y,
3366; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
3367; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3368; CM-NEXT:    64(8.968310e-44), 0(0.000000e+00)
3369; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
3370; CM-NEXT:     LSHR T2.Y, T19.W, literal.y,
3371; CM-NEXT:     LSHR T2.Z, T19.X, literal.y,
3372; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
3373; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3374; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
3375; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
3376; CM-NEXT:     LSHR T3.Y, T19.Y, literal.y,
3377; CM-NEXT:     LSHR T3.Z, T23.Z, literal.y,
3378; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
3379; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3380; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
3381; CM-NEXT:     LSHR T27.X, PV.W, literal.x,
3382; CM-NEXT:     LSHR T4.Y, T23.W, literal.y,
3383; CM-NEXT:     LSHR T4.Z, T23.X, literal.y,
3384; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
3385; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3386; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
3387; CM-NEXT:     LSHR T28.X, PV.W, literal.x,
3388; CM-NEXT:     LSHR T5.Y, T23.Y, literal.y,
3389; CM-NEXT:     BFE_INT T29.Z, T22.Y, 0.0, literal.y, BS:VEC_120/SCL_212
3390; CM-NEXT:     LSHR * T0.W, T22.Z, literal.y,
3391; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3392; CM-NEXT:     BFE_INT T29.X, T22.X, 0.0, literal.x,
3393; CM-NEXT:     LSHR T6.Y, T22.W, literal.x,
3394; CM-NEXT:     BFE_INT T30.Z, T22.W, 0.0, literal.x,
3395; CM-NEXT:     LSHR * T1.W, T22.Y, literal.x,
3396; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3397; CM-NEXT:     BFE_INT T30.X, T22.Z, 0.0, literal.x,
3398; CM-NEXT:     LSHR T7.Y, T22.X, literal.x,
3399; CM-NEXT:     BFE_INT T22.Z, T23.Y, 0.0, literal.x,
3400; CM-NEXT:     BFE_INT * T29.W, PV.W, 0.0, literal.x,
3401; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3402; CM-NEXT:     BFE_INT T22.X, T23.X, 0.0, literal.x,
3403; CM-NEXT:     BFE_INT T29.Y, PV.Y, 0.0, literal.x,
3404; CM-NEXT:     BFE_INT T31.Z, T23.W, 0.0, literal.x,
3405; CM-NEXT:     BFE_INT * T30.W, T6.Y, 0.0, literal.x,
3406; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3407; CM-NEXT:     BFE_INT T31.X, T23.Z, 0.0, literal.x,
3408; CM-NEXT:     BFE_INT T30.Y, T0.W, 0.0, literal.x,
3409; CM-NEXT:     BFE_INT T23.Z, T19.Y, 0.0, literal.x,
3410; CM-NEXT:     BFE_INT * T22.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
3411; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3412; CM-NEXT:     BFE_INT T23.X, T19.X, 0.0, literal.x,
3413; CM-NEXT:     BFE_INT T22.Y, T4.Z, 0.0, literal.x,
3414; CM-NEXT:     BFE_INT T32.Z, T19.W, 0.0, literal.x,
3415; CM-NEXT:     BFE_INT * T31.W, T4.Y, 0.0, literal.x,
3416; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3417; CM-NEXT:     BFE_INT T32.X, T19.Z, 0.0, literal.x,
3418; CM-NEXT:     BFE_INT T31.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
3419; CM-NEXT:     BFE_INT T19.Z, T20.Y, 0.0, literal.x,
3420; CM-NEXT:     BFE_INT * T23.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
3421; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3422; CM-NEXT:     BFE_INT T19.X, T20.X, 0.0, literal.x,
3423; CM-NEXT:     BFE_INT T23.Y, T2.Z, 0.0, literal.x,
3424; CM-NEXT:     BFE_INT T33.Z, T20.W, 0.0, literal.x,
3425; CM-NEXT:     BFE_INT * T32.W, T2.Y, 0.0, literal.x,
3426; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3427; CM-NEXT:     BFE_INT T33.X, T20.Z, 0.0, literal.x,
3428; CM-NEXT:     BFE_INT T32.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
3429; CM-NEXT:     LSHR T1.Z, T20.X, literal.x,
3430; CM-NEXT:     BFE_INT * T19.W, T1.Y, 0.0, literal.x,
3431; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3432; CM-NEXT:     LSHR T20.X, KC0[2].Y, literal.x,
3433; CM-NEXT:     BFE_INT T19.Y, PV.Z, 0.0, literal.y,
3434; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.y,
3435; CM-NEXT:     BFE_INT * T33.W, T0.Z, 0.0, literal.y,
3436; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3437; CM-NEXT:     LSHR T34.X, PV.Z, literal.x,
3438; CM-NEXT:     BFE_INT * T33.Y, T0.Y, 0.0, literal.y,
3439; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3440  %load = load <32 x i16>, ptr addrspace(1) %in
3441  %ext = sext <32 x i16> %load to <32 x i32>
3442  store <32 x i32> %ext, ptr addrspace(1) %out
3443  ret void
3444}
3445
3446define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
3447; GCN-NOHSA-SI-LABEL: global_zextload_v64i16_to_v64i32:
3448; GCN-NOHSA-SI:       ; %bb.0:
3449; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
3450; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
3451; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
3452; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
3453; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
3454; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
3455; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
3456; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
3457; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
3458; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
3459; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
3460; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
3461; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
3462; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
3463; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
3464; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
3465; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
3466; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
3467; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64
3468; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80
3469; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96
3470; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
3471; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(7)
3472; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
3473; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
3474; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
3475; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
3476; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
3477; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v11
3478; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
3479; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v17, 0xffff, v15
3480; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v14
3481; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 ; 4-byte Folded Spill
3482; GCN-NOHSA-SI-NEXT:    buffer_store_dword v16, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
3483; GCN-NOHSA-SI-NEXT:    buffer_store_dword v17, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
3484; GCN-NOHSA-SI-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
3485; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v13
3486; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
3487; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v12
3488; GCN-NOHSA-SI-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
3489; GCN-NOHSA-SI-NEXT:    buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
3490; GCN-NOHSA-SI-NEXT:    buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
3491; GCN-NOHSA-SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
3492; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
3493; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
3494; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
3495; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v25, 0xffff, v11
3496; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v10
3497; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v18, 0xffff, v9
3498; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v8
3499; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(13)
3500; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
3501; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
3502; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v5
3503; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
3504; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v14, 0xffff, v7
3505; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v6
3506; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xffff, v5
3507; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v43, 0xffff, v4
3508; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(12)
3509; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
3510; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
3511; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v1
3512; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v0
3513; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
3514; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
3515; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v1
3516; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v47, 0xffff, v0
3517; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(11)
3518; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v30
3519; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v29
3520; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v28
3521; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v27
3522; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xffff, v30
3523; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v51, 0xffff, v29
3524; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v57, 0xffff, v28
3525; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v55, 0xffff, v27
3526; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(10)
3527; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v34
3528; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
3529; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v32
3530; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v31
3531; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v29, 0xffff, v34
3532; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v33
3533; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v61, 0xffff, v32
3534; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v59, 0xffff, v31
3535; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(9)
3536; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v38
3537; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v37
3538; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v36
3539; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v35
3540; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v33, 0xffff, v38
3541; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v37
3542; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v6, 0xffff, v36
3543; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v35
3544; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(8)
3545; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
3546; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
3547; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
3548; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
3549; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, 0xffff, v42
3550; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v35, 0xffff, v41
3551; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v2, 0xffff, v40
3552; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v39
3553; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
3554; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
3555; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
3556; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240
3557; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
3558; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208
3559; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
3560; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176
3561; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
3562; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
3563; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
3564; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
3565; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
3566; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
3567; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
3568; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
3569; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
3570; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
3571; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
3572; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
3573; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
3574; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3575; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
3576; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
3577; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
3578; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
3579; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
3580; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
3581; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3582; GCN-NOHSA-SI-NEXT:    s_endpgm
3583;
3584; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
3585; GCN-HSA:       ; %bb.0:
3586; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
3587; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
3588; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
3589; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
3590; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[0:1]
3591; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
3592; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
3593; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
3594; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
3595; GCN-HSA-NEXT:    s_add_u32 s8, s2, 48
3596; GCN-HSA-NEXT:    s_addc_u32 s9, s3, 0
3597; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s9
3598; GCN-HSA-NEXT:    s_add_u32 s10, s2, 64
3599; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s8
3600; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[16:17]
3601; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
3602; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
3603; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s11
3604; GCN-HSA-NEXT:    s_add_u32 s10, s2, 0x50
3605; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
3606; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
3607; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
3608; GCN-HSA-NEXT:    s_add_u32 s10, s2, 0x60
3609; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3610; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[4:5]
3611; GCN-HSA-NEXT:    s_addc_u32 s11, s3, 0
3612; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
3613; GCN-HSA-NEXT:    s_add_u32 s2, s2, 0x70
3614; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
3615; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
3616; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
3617; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
3618; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
3619; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[12:13]
3620; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s5
3621; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s7
3622; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s4
3623; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s6
3624; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[12:13]
3625; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[14:15]
3626; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
3627; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3628; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
3629; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
3630; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
3631; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
3632; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
3633; GCN-HSA-NEXT:    v_and_b32_e32 v34, 0xffff, v25
3634; GCN-HSA-NEXT:    v_and_b32_e32 v32, 0xffff, v24
3635; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
3636; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
3637; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
3638; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3639; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xf0
3640; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[32:35]
3641; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
3642; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
3643; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
3644; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
3645; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3646; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xd0
3647; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
3648; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xa0
3649; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
3650; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xb0
3651; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
3652; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x80
3653; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v35, 16, v27
3654; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v33, 16, v26
3655; GCN-HSA-NEXT:    v_and_b32_e32 v34, 0xffff, v27
3656; GCN-HSA-NEXT:    v_and_b32_e32 v32, 0xffff, v26
3657; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
3658; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[32:35]
3659; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
3660; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v1
3661; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s13
3662; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s12
3663; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
3664; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v1
3665; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v0
3666; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
3667; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
3668; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
3669; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
3670; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v9
3671; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
3672; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v9
3673; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v8
3674; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s10
3675; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
3676; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s11
3677; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
3678; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
3679; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v11
3680; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v10
3681; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[24:27]
3682; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
3683; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
3684; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
3685; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v5
3686; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v4
3687; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
3688; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
3689; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
3690; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s7
3691; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
3692; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
3693; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
3694; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v7
3695; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v6
3696; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s6
3697; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
3698; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
3699; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v28
3700; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v29
3701; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v28
3702; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v29
3703; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
3704; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
3705; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3706; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v27, 16, v31
3707; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v25, 16, v30
3708; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v31
3709; GCN-HSA-NEXT:    v_and_b32_e32 v24, 0xffff, v30
3710; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[7:10]
3711; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
3712; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
3713; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
3714; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
3715; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
3716; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
3717; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v3
3718; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v2
3719; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3720; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
3721; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
3722; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
3723; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
3724; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
3725; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3726; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
3727; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v21
3728; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v21
3729; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v20
3730; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
3731; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
3732; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3733; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
3734; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
3735; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v23
3736; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v22
3737; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3738; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
3739; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
3740; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
3741; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
3742; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
3743; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
3744; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
3745; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v18
3746; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v17
3747; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v16
3748; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
3749; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
3750; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v15
3751; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v13
3752; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v12
3753; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
3754; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
3755; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
3756; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3757; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[15:18]
3758; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
3759; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
3760; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
3761; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v14
3762; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v14
3763; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
3764; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
3765; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
3766; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
3767; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
3768; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
3769; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
3770; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
3771; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
3772; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v19
3773; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
3774; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3775; GCN-HSA-NEXT:    s_endpgm
3776;
3777; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32:
3778; GCN-NOHSA-VI:       ; %bb.0:
3779; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
3780; GCN-NOHSA-VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
3781; GCN-NOHSA-VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
3782; GCN-NOHSA-VI-NEXT:    s_mov_b32 s90, -1
3783; GCN-NOHSA-VI-NEXT:    s_mov_b32 s91, 0xe80000
3784; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
3785; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
3786; GCN-NOHSA-VI-NEXT:    s_add_u32 s88, s88, s11
3787; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
3788; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
3789; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
3790; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
3791; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
3792; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
3793; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
3794; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
3795; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
3796; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:64
3797; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:80
3798; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:96
3799; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112
3800; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
3801; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
3802; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
3803; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(7)
3804; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v15
3805; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
3806; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v17, 0xffff, v15
3807; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v15, 0xffff, v14
3808; GCN-NOHSA-VI-NEXT:    buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
3809; GCN-NOHSA-VI-NEXT:    buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
3810; GCN-NOHSA-VI-NEXT:    buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
3811; GCN-NOHSA-VI-NEXT:    buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
3812; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
3813; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
3814; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v13
3815; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v12
3816; GCN-NOHSA-VI-NEXT:    buffer_store_dword v18, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
3817; GCN-NOHSA-VI-NEXT:    buffer_store_dword v19, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
3818; GCN-NOHSA-VI-NEXT:    buffer_store_dword v20, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
3819; GCN-NOHSA-VI-NEXT:    buffer_store_dword v21, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
3820; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(14)
3821; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v11
3822; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v24, 16, v10
3823; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
3824; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
3825; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v25, 0xffff, v11
3826; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v23, 0xffff, v10
3827; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v18, 0xffff, v9
3828; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v8
3829; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(12)
3830; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
3831; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
3832; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v1
3833; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v0
3834; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
3835; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
3836; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v49, 0xffff, v1
3837; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v47, 0xffff, v0
3838; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(8)
3839; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v40
3840; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v39
3841; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v2, 0xffff, v40
3842; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v39
3843; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
3844; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
3845; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v5
3846; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
3847; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v14, 0xffff, v7
3848; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v6
3849; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, 0xffff, v5
3850; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v43, 0xffff, v4
3851; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v54, 16, v30
3852; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v29
3853; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v58, 16, v28
3854; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v27
3855; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v53, 0xffff, v30
3856; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, 0xffff, v29
3857; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v57, 0xffff, v28
3858; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v55, 0xffff, v27
3859; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v34
3860; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
3861; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v62, 16, v32
3862; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v60, 16, v31
3863; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v29, 0xffff, v34
3864; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v27, 0xffff, v33
3865; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v61, 0xffff, v32
3866; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v59, 0xffff, v31
3867; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v38
3868; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v32, 16, v37
3869; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v36
3870; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v35
3871; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v33, 0xffff, v38
3872; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v31, 0xffff, v37
3873; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v36
3874; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v35
3875; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v42
3876; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v36, 16, v41
3877; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v37, 0xffff, v42
3878; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v35, 0xffff, v41
3879; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
3880; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:240
3881; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
3882; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:208
3883; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
3884; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176
3885; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
3886; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
3887; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
3888; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
3889; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
3890; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
3891; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
3892; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
3893; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
3894; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
3895; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
3896; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
3897; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
3898; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3899; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
3900; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
3901; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
3902; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
3903; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
3904; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3905; GCN-NOHSA-VI-NEXT:    s_endpgm
3906;
3907; EG-LABEL: global_zextload_v64i16_to_v64i32:
3908; EG:       ; %bb.0:
3909; EG-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
3910; EG-NEXT:    TEX 3 @22
3911; EG-NEXT:    ALU 56, @39, KC0[CB0:0-32], KC1[]
3912; EG-NEXT:    TEX 3 @30
3913; EG-NEXT:    ALU 87, @96, KC0[CB0:0-32], KC1[]
3914; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T66.X, 0
3915; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T65.X, 0
3916; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T64.X, 0
3917; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T62.X, 0
3918; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T61.X, 0
3919; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0
3920; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0
3921; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0
3922; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T55.X, 0
3923; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T53.X, 0
3924; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T48.X, 0
3925; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
3926; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T46.X, 0
3927; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0
3928; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
3929; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T41.X, 1
3930; EG-NEXT:    CF_END
3931; EG-NEXT:    Fetch clause starting at 22:
3932; EG-NEXT:     VTX_READ_128 T36.XYZW, T37.X, 0, #1
3933; EG-NEXT:     VTX_READ_128 T38.XYZW, T37.X, 48, #1
3934; EG-NEXT:     VTX_READ_128 T39.XYZW, T37.X, 32, #1
3935; EG-NEXT:     VTX_READ_128 T40.XYZW, T37.X, 16, #1
3936; EG-NEXT:    Fetch clause starting at 30:
3937; EG-NEXT:     VTX_READ_128 T49.XYZW, T37.X, 112, #1
3938; EG-NEXT:     VTX_READ_128 T50.XYZW, T37.X, 96, #1
3939; EG-NEXT:     VTX_READ_128 T51.XYZW, T37.X, 80, #1
3940; EG-NEXT:     VTX_READ_128 T52.XYZW, T37.X, 64, #1
3941; EG-NEXT:    ALU clause starting at 38:
3942; EG-NEXT:     MOV * T37.X, KC0[2].Z,
3943; EG-NEXT:    ALU clause starting at 39:
3944; EG-NEXT:     LSHR * T35.W, T36.W, literal.x,
3945; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3946; EG-NEXT:     AND_INT * T35.Z, T36.W, literal.x,
3947; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3948; EG-NEXT:     LSHR T35.Y, T36.Z, literal.x,
3949; EG-NEXT:     LSHR * T36.W, T36.Y, literal.x,
3950; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3951; EG-NEXT:     AND_INT T35.X, T36.Z, literal.x,
3952; EG-NEXT:     AND_INT T36.Z, T36.Y, literal.x,
3953; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3954; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3955; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
3956; EG-NEXT:     LSHR T36.Y, T36.X, literal.y,
3957; EG-NEXT:     LSHR T42.W, T40.W, literal.y,
3958; EG-NEXT:     AND_INT * T36.X, T36.X, literal.z,
3959; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3960; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3961; EG-NEXT:     AND_INT * T42.Z, T40.W, literal.x,
3962; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3963; EG-NEXT:     LSHR T43.X, KC0[2].Y, literal.x,
3964; EG-NEXT:     LSHR T42.Y, T40.Z, literal.y,
3965; EG-NEXT:     LSHR T40.W, T40.Y, literal.y,
3966; EG-NEXT:     AND_INT * T42.X, T40.Z, literal.z,
3967; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3968; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3969; EG-NEXT:     AND_INT T40.Z, T40.Y, literal.x,
3970; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3971; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
3972; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
3973; EG-NEXT:     LSHR T40.Y, T40.X, literal.y,
3974; EG-NEXT:     LSHR T45.W, T39.W, literal.y,
3975; EG-NEXT:     AND_INT * T40.X, T40.X, literal.z,
3976; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3977; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3978; EG-NEXT:     AND_INT T45.Z, T39.W, literal.x,
3979; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3980; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
3981; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
3982; EG-NEXT:     LSHR T45.Y, T39.Z, literal.y,
3983; EG-NEXT:     LSHR T39.W, T39.Y, literal.y,
3984; EG-NEXT:     AND_INT * T45.X, T39.Z, literal.z,
3985; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3986; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3987; EG-NEXT:     AND_INT T39.Z, T39.Y, literal.x,
3988; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3989; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
3990; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
3991; EG-NEXT:     LSHR T39.Y, T39.X, literal.y,
3992; EG-NEXT:     AND_INT * T39.X, T39.X, literal.z,
3993; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3994; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3995; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
3996; EG-NEXT:     LSHR * T37.W, T38.W, literal.y,
3997; EG-NEXT:    64(8.968310e-44), 16(2.242078e-44)
3998; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
3999; EG-NEXT:     AND_INT * T37.Z, T38.W, literal.y,
4000; EG-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
4001; EG-NEXT:    ALU clause starting at 96:
4002; EG-NEXT:     LSHR T37.Y, T38.Z, literal.x,
4003; EG-NEXT:     LSHR * T38.W, T38.Y, literal.x,
4004; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4005; EG-NEXT:     AND_INT T37.X, T38.Z, literal.x,
4006; EG-NEXT:     AND_INT T38.Z, T38.Y, literal.x,
4007; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4008; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
4009; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
4010; EG-NEXT:     LSHR T38.Y, T38.X, literal.y,
4011; EG-NEXT:     LSHR T54.W, T52.W, literal.y,
4012; EG-NEXT:     AND_INT * T38.X, T38.X, literal.z,
4013; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4014; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4015; EG-NEXT:     AND_INT T54.Z, T52.W, literal.x,
4016; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4017; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
4018; EG-NEXT:     LSHR T55.X, PV.W, literal.x,
4019; EG-NEXT:     LSHR T54.Y, T52.Z, literal.y,
4020; EG-NEXT:     LSHR T52.W, T52.Y, literal.y,
4021; EG-NEXT:     AND_INT * T54.X, T52.Z, literal.z,
4022; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4023; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4024; EG-NEXT:     AND_INT T52.Z, T52.Y, literal.x,
4025; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4026; EG-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
4027; EG-NEXT:     LSHR T56.X, PV.W, literal.x,
4028; EG-NEXT:     LSHR T52.Y, T52.X, literal.y,
4029; EG-NEXT:     LSHR T57.W, T51.W, literal.y,
4030; EG-NEXT:     AND_INT * T52.X, T52.X, literal.z,
4031; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4032; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4033; EG-NEXT:     AND_INT T57.Z, T51.W, literal.x,
4034; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4035; EG-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
4036; EG-NEXT:     LSHR T58.X, PV.W, literal.x,
4037; EG-NEXT:     LSHR T57.Y, T51.Z, literal.y,
4038; EG-NEXT:     LSHR T51.W, T51.Y, literal.y,
4039; EG-NEXT:     AND_INT * T57.X, T51.Z, literal.z,
4040; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4041; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4042; EG-NEXT:     AND_INT T51.Z, T51.Y, literal.x,
4043; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4044; EG-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
4045; EG-NEXT:     LSHR T59.X, PV.W, literal.x,
4046; EG-NEXT:     LSHR T51.Y, T51.X, literal.y,
4047; EG-NEXT:     LSHR T60.W, T50.W, literal.y,
4048; EG-NEXT:     AND_INT * T51.X, T51.X, literal.z,
4049; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4050; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4051; EG-NEXT:     AND_INT T60.Z, T50.W, literal.x,
4052; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4053; EG-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
4054; EG-NEXT:     LSHR T61.X, PV.W, literal.x,
4055; EG-NEXT:     LSHR T60.Y, T50.Z, literal.y,
4056; EG-NEXT:     LSHR T50.W, T50.Y, literal.y,
4057; EG-NEXT:     AND_INT * T60.X, T50.Z, literal.z,
4058; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4059; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4060; EG-NEXT:     AND_INT T50.Z, T50.Y, literal.x,
4061; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4062; EG-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
4063; EG-NEXT:     LSHR T62.X, PV.W, literal.x,
4064; EG-NEXT:     LSHR T50.Y, T50.X, literal.y,
4065; EG-NEXT:     LSHR T63.W, T49.W, literal.y,
4066; EG-NEXT:     AND_INT * T50.X, T50.X, literal.z,
4067; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4068; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4069; EG-NEXT:     AND_INT T63.Z, T49.W, literal.x,
4070; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4071; EG-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
4072; EG-NEXT:     LSHR T64.X, PV.W, literal.x,
4073; EG-NEXT:     LSHR T63.Y, T49.Z, literal.y,
4074; EG-NEXT:     LSHR T49.W, T49.Y, literal.y,
4075; EG-NEXT:     AND_INT * T63.X, T49.Z, literal.z,
4076; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4077; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4078; EG-NEXT:     AND_INT T49.Z, T49.Y, literal.x,
4079; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4080; EG-NEXT:    65535(9.183409e-41), 240(3.363116e-43)
4081; EG-NEXT:     LSHR T65.X, PV.W, literal.x,
4082; EG-NEXT:     LSHR T49.Y, T49.X, literal.y,
4083; EG-NEXT:     AND_INT * T49.X, T49.X, literal.z,
4084; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4085; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4086; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4087; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
4088; EG-NEXT:     LSHR * T66.X, PV.W, literal.x,
4089; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4090;
4091; CM-LABEL: global_zextload_v64i16_to_v64i32:
4092; CM:       ; %bb.0:
4093; CM-NEXT:    ALU 0, @38, KC0[CB0:0-32], KC1[]
4094; CM-NEXT:    TEX 3 @22
4095; CM-NEXT:    ALU 50, @39, KC0[CB0:0-32], KC1[]
4096; CM-NEXT:    TEX 3 @30
4097; CM-NEXT:    ALU 78, @90, KC0[CB0:0-32], KC1[]
4098; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
4099; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T48.X
4100; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T62, T64.X
4101; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T60, T49.X
4102; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T59, T61.X
4103; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T57, T50.X
4104; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T56, T58.X
4105; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T54, T51.X
4106; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T53, T55.X
4107; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T37.X
4108; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T52.X
4109; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T38.X
4110; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T46.X
4111; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T39.X
4112; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T43.X
4113; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T36.X
4114; CM-NEXT:    CF_END
4115; CM-NEXT:    Fetch clause starting at 22:
4116; CM-NEXT:     VTX_READ_128 T36.XYZW, T35.X, 112, #1
4117; CM-NEXT:     VTX_READ_128 T37.XYZW, T35.X, 64, #1
4118; CM-NEXT:     VTX_READ_128 T38.XYZW, T35.X, 80, #1
4119; CM-NEXT:     VTX_READ_128 T39.XYZW, T35.X, 96, #1
4120; CM-NEXT:    Fetch clause starting at 30:
4121; CM-NEXT:     VTX_READ_128 T48.XYZW, T35.X, 0, #1
4122; CM-NEXT:     VTX_READ_128 T49.XYZW, T35.X, 16, #1
4123; CM-NEXT:     VTX_READ_128 T50.XYZW, T35.X, 32, #1
4124; CM-NEXT:     VTX_READ_128 T51.XYZW, T35.X, 48, #1
4125; CM-NEXT:    ALU clause starting at 38:
4126; CM-NEXT:     MOV * T35.X, KC0[2].Z,
4127; CM-NEXT:    ALU clause starting at 39:
4128; CM-NEXT:     LSHR * T40.W, T36.Y, literal.x,
4129; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4130; CM-NEXT:     AND_INT * T40.Z, T36.Y, literal.x,
4131; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4132; CM-NEXT:     LSHR T40.Y, T36.X, literal.x,
4133; CM-NEXT:     LSHR * T41.W, T36.W, literal.x,
4134; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4135; CM-NEXT:     AND_INT T40.X, T36.X, literal.x,
4136; CM-NEXT:     AND_INT T41.Z, T36.W, literal.x,
4137; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4138; CM-NEXT:    65535(9.183409e-41), 224(3.138909e-43)
4139; CM-NEXT:     LSHR T36.X, PV.W, literal.x,
4140; CM-NEXT:     LSHR T41.Y, T36.Z, literal.y,
4141; CM-NEXT:     LSHR * T42.W, T39.Y, literal.y,
4142; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4143; CM-NEXT:     AND_INT T41.X, T36.Z, literal.x,
4144; CM-NEXT:     AND_INT T42.Z, T39.Y, literal.x,
4145; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4146; CM-NEXT:    65535(9.183409e-41), 240(3.363116e-43)
4147; CM-NEXT:     LSHR T43.X, PV.W, literal.x,
4148; CM-NEXT:     LSHR T42.Y, T39.X, literal.y,
4149; CM-NEXT:     LSHR * T44.W, T39.W, literal.y,
4150; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4151; CM-NEXT:     AND_INT T42.X, T39.X, literal.x,
4152; CM-NEXT:     AND_INT T44.Z, T39.W, literal.x,
4153; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4154; CM-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
4155; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
4156; CM-NEXT:     LSHR T44.Y, T39.Z, literal.y,
4157; CM-NEXT:     LSHR * T45.W, T38.Y, literal.y,
4158; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4159; CM-NEXT:     AND_INT T44.X, T39.Z, literal.x,
4160; CM-NEXT:     AND_INT T45.Z, T38.Y, literal.x,
4161; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4162; CM-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
4163; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
4164; CM-NEXT:     LSHR T45.Y, T38.X, literal.y,
4165; CM-NEXT:     LSHR * T47.W, T38.W, literal.y,
4166; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4167; CM-NEXT:     AND_INT T45.X, T38.X, literal.x,
4168; CM-NEXT:     AND_INT T47.Z, T38.W, literal.x,
4169; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4170; CM-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
4171; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
4172; CM-NEXT:     LSHR T47.Y, T38.Z, literal.y,
4173; CM-NEXT:     LSHR * T35.W, T37.Y, literal.y,
4174; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4175; CM-NEXT:     AND_INT T47.X, T38.Z, literal.x,
4176; CM-NEXT:     AND_INT T35.Z, T37.Y, literal.x,
4177; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4178; CM-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
4179; CM-NEXT:    ALU clause starting at 90:
4180; CM-NEXT:     LSHR T52.X, T0.W, literal.x,
4181; CM-NEXT:     LSHR T35.Y, T37.X, literal.y,
4182; CM-NEXT:     LSHR * T53.W, T37.W, literal.y, BS:VEC_120/SCL_212
4183; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4184; CM-NEXT:     AND_INT T35.X, T37.X, literal.x,
4185; CM-NEXT:     AND_INT T53.Z, T37.W, literal.x,
4186; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4187; CM-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
4188; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
4189; CM-NEXT:     LSHR T53.Y, T37.Z, literal.y,
4190; CM-NEXT:     LSHR * T54.W, T51.Y, literal.y,
4191; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4192; CM-NEXT:     AND_INT T53.X, T37.Z, literal.x,
4193; CM-NEXT:     AND_INT T54.Z, T51.Y, literal.x,
4194; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4195; CM-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
4196; CM-NEXT:     LSHR T55.X, PV.W, literal.x,
4197; CM-NEXT:     LSHR T54.Y, T51.X, literal.y,
4198; CM-NEXT:     LSHR * T56.W, T51.W, literal.y,
4199; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4200; CM-NEXT:     AND_INT T54.X, T51.X, literal.x,
4201; CM-NEXT:     AND_INT T56.Z, T51.W, literal.x,
4202; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4203; CM-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
4204; CM-NEXT:     LSHR T51.X, PV.W, literal.x,
4205; CM-NEXT:     LSHR T56.Y, T51.Z, literal.y,
4206; CM-NEXT:     LSHR * T57.W, T50.Y, literal.y,
4207; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4208; CM-NEXT:     AND_INT T56.X, T51.Z, literal.x,
4209; CM-NEXT:     AND_INT T57.Z, T50.Y, literal.x,
4210; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4211; CM-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
4212; CM-NEXT:     LSHR T58.X, PV.W, literal.x,
4213; CM-NEXT:     LSHR T57.Y, T50.X, literal.y,
4214; CM-NEXT:     LSHR * T59.W, T50.W, literal.y,
4215; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4216; CM-NEXT:     AND_INT T57.X, T50.X, literal.x,
4217; CM-NEXT:     AND_INT T59.Z, T50.W, literal.x,
4218; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4219; CM-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
4220; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
4221; CM-NEXT:     LSHR T59.Y, T50.Z, literal.y,
4222; CM-NEXT:     LSHR * T60.W, T49.Y, literal.y,
4223; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4224; CM-NEXT:     AND_INT T59.X, T50.Z, literal.x,
4225; CM-NEXT:     AND_INT T60.Z, T49.Y, literal.x,
4226; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4227; CM-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
4228; CM-NEXT:     LSHR T61.X, PV.W, literal.x,
4229; CM-NEXT:     LSHR T60.Y, T49.X, literal.y,
4230; CM-NEXT:     LSHR * T62.W, T49.W, literal.y,
4231; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4232; CM-NEXT:     AND_INT T60.X, T49.X, literal.x,
4233; CM-NEXT:     AND_INT T62.Z, T49.W, literal.x,
4234; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4235; CM-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
4236; CM-NEXT:     LSHR T49.X, PV.W, literal.x,
4237; CM-NEXT:     LSHR T62.Y, T49.Z, literal.y,
4238; CM-NEXT:     LSHR * T63.W, T48.Y, literal.y,
4239; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4240; CM-NEXT:     AND_INT T62.X, T49.Z, literal.x,
4241; CM-NEXT:     AND_INT T63.Z, T48.Y, literal.x,
4242; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4243; CM-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
4244; CM-NEXT:     LSHR T64.X, PV.W, literal.x,
4245; CM-NEXT:     LSHR T63.Y, T48.X, literal.y,
4246; CM-NEXT:     LSHR * T65.W, T48.W, literal.y,
4247; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4248; CM-NEXT:     AND_INT T63.X, T48.X, literal.x,
4249; CM-NEXT:     AND_INT * T65.Z, T48.W, literal.x,
4250; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4251; CM-NEXT:     LSHR T48.X, KC0[2].Y, literal.x,
4252; CM-NEXT:     LSHR * T65.Y, T48.Z, literal.y,
4253; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4254; CM-NEXT:     AND_INT T65.X, T48.Z, literal.x,
4255; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4256; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4257; CM-NEXT:     LSHR * T66.X, PV.W, literal.x,
4258; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4259  %load = load <64 x i16>, ptr addrspace(1) %in
4260  %ext = zext <64 x i16> %load to <64 x i32>
4261  store <64 x i32> %ext, ptr addrspace(1) %out
4262  ret void
4263}
4264
4265define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
4266; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32:
4267; GCN-NOHSA-SI:       ; %bb.0:
4268; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4269; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4270; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
4271; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
4272; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
4273; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
4274; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
4275; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
4276; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
4277; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
4278; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
4279; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
4280; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s6
4281; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s7
4282; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s2
4283; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, s3
4284; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:112
4285; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:96
4286; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:80
4287; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:64
4288; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0
4289; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:16
4290; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:32
4291; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[40:43], off, s[4:7], 0 offset:48
4292; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
4293; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v11
4294; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v10
4295; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v11, 0, 16
4296; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v10, 0, 16
4297; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
4298; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
4299; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
4300; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
4301; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v9
4302; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v8
4303; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v9, 0, 16
4304; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v8, 0, 16
4305; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(6)
4306; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v35
4307; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v34
4308; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v35, 0, 16
4309; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v34, 0, 16
4310; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v33
4311; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v32
4312; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v33, 0, 16
4313; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v32, 0, 16
4314; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
4315; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v39
4316; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v38
4317; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v34, v39, 0, 16
4318; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v32, v38, 0, 16
4319; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v47, 16, v37
4320; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v36
4321; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v46, v37, 0, 16
4322; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v44, v36, 0, 16
4323; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
4324; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v43
4325; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v42
4326; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v38, v43, 0, 16
4327; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v36, v42, 0, 16
4328; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v51, 16, v41
4329; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v40
4330; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v50, v41, 0, 16
4331; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v48, v40, 0, 16
4332; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v31
4333; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v30
4334; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v42, v31, 0, 16
4335; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v40, v30, 0, 16
4336; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v55, 16, v29
4337; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v28
4338; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v54, v29, 0, 16
4339; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v52, v28, 0, 16
4340; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v30, 16, v27
4341; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v28, 16, v26
4342; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v29, v27, 0, 16
4343; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v27, v26, 0, 16
4344; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v59, 16, v25
4345; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v24
4346; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v58, v25, 0, 16
4347; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v56, v24, 0, 16
4348; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
4349; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
4350; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v23, 0, 16
4351; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v22, 0, 16
4352; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v63, 16, v21
4353; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v20
4354; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v62, v21, 0, 16
4355; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v60, v20, 0, 16
4356; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 16, v19
4357; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 16, v18
4358; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v19, 0, 16
4359; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v18, 0, 16
4360; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
4361; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v17
4362; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v16
4363; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v17, 0, 16
4364; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v16, 0, 16
4365; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
4366; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:240
4367; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
4368; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208
4369; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
4370; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:176
4371; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
4372; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:144
4373; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
4374; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112
4375; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
4376; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:80
4377; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
4378; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
4379; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
4380; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
4381; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
4382; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
4383; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
4384; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
4385; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4386; GCN-NOHSA-SI-NEXT:    s_endpgm
4387;
4388; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
4389; GCN-HSA:       ; %bb.0:
4390; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
4391; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
4392; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
4393; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
4394; GCN-HSA-NEXT:    flat_load_dwordx4 v[28:31], v[0:1]
4395; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x70
4396; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
4397; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
4398; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
4399; GCN-HSA-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
4400; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x60
4401; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
4402; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
4403; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
4404; GCN-HSA-NEXT:    s_add_u32 s4, s2, 0x50
4405; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
4406; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
4407; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
4408; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
4409; GCN-HSA-NEXT:    s_add_u32 s4, s2, 64
4410; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
4411; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
4412; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
4413; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
4414; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
4415; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
4416; GCN-HSA-NEXT:    s_add_u32 s6, s2, 32
4417; GCN-HSA-NEXT:    s_addc_u32 s7, s3, 0
4418; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
4419; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
4420; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
4421; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
4422; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
4423; GCN-HSA-NEXT:    flat_load_dwordx4 v[24:27], v[8:9]
4424; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s5
4425; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s4
4426; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
4427; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s7
4428; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s6
4429; GCN-HSA-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
4430; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
4431; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4432; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s1
4433; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s0
4434; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
4435; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v35, 16, v29
4436; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v33, 16, v28
4437; GCN-HSA-NEXT:    v_bfe_i32 v34, v29, 0, 16
4438; GCN-HSA-NEXT:    v_bfe_i32 v32, v28, 0, 16
4439; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
4440; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
4441; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
4442; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4443; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[32:35]
4444; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s3
4445; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s2
4446; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
4447; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v34, 16, v31
4448; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v32, 16, v30
4449; GCN-HSA-NEXT:    v_bfe_i32 v33, v31, 0, 16
4450; GCN-HSA-NEXT:    v_bfe_i32 v31, v30, 0, 16
4451; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4452; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[31:34]
4453; GCN-HSA-NEXT:    s_waitcnt vmcnt(8)
4454; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v20
4455; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
4456; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
4457; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
4458; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4459; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v21
4460; GCN-HSA-NEXT:    v_bfe_i32 v30, v21, 0, 16
4461; GCN-HSA-NEXT:    v_bfe_i32 v28, v20, 0, 16
4462; GCN-HSA-NEXT:    flat_store_dwordx4 v[35:36], v[28:31]
4463; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s3
4464; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s2
4465; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
4466; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4467; GCN-HSA-NEXT:    v_mov_b32_e32 v37, s3
4468; GCN-HSA-NEXT:    v_mov_b32_e32 v36, s2
4469; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
4470; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v23
4471; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v22
4472; GCN-HSA-NEXT:    v_bfe_i32 v30, v23, 0, 16
4473; GCN-HSA-NEXT:    v_bfe_i32 v28, v22, 0, 16
4474; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4475; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
4476; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
4477; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
4478; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
4479; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4480; GCN-HSA-NEXT:    v_mov_b32_e32 v39, s3
4481; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
4482; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v13
4483; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v12
4484; GCN-HSA-NEXT:    v_bfe_i32 v22, v13, 0, 16
4485; GCN-HSA-NEXT:    v_bfe_i32 v20, v12, 0, 16
4486; GCN-HSA-NEXT:    v_mov_b32_e32 v38, s2
4487; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
4488; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
4489; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v29, 16, v14
4490; GCN-HSA-NEXT:    v_bfe_i32 v30, v15, 0, 16
4491; GCN-HSA-NEXT:    v_bfe_i32 v28, v14, 0, 16
4492; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[20:23]
4493; GCN-HSA-NEXT:    flat_store_dwordx4 v[36:37], v[28:31]
4494; GCN-HSA-NEXT:    s_waitcnt vmcnt(10)
4495; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v5
4496; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v4
4497; GCN-HSA-NEXT:    v_bfe_i32 v14, v5, 0, 16
4498; GCN-HSA-NEXT:    v_bfe_i32 v12, v4, 0, 16
4499; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v7
4500; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v6
4501; GCN-HSA-NEXT:    v_bfe_i32 v22, v7, 0, 16
4502; GCN-HSA-NEXT:    v_bfe_i32 v20, v6, 0, 16
4503; GCN-HSA-NEXT:    s_waitcnt vmcnt(9)
4504; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
4505; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
4506; GCN-HSA-NEXT:    v_bfe_i32 v6, v1, 0, 16
4507; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
4508; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4509; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
4510; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
4511; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
4512; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[12:15]
4513; GCN-HSA-NEXT:    flat_store_dwordx4 v[38:39], v[20:23]
4514; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
4515; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4516; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
4517; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
4518; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
4519; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 16, v3
4520; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 16, v2
4521; GCN-HSA-NEXT:    v_bfe_i32 v22, v3, 0, 16
4522; GCN-HSA-NEXT:    v_bfe_i32 v20, v2, 0, 16
4523; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4524; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
4525; GCN-HSA-NEXT:    s_waitcnt vmcnt(11)
4526; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
4527; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v8
4528; GCN-HSA-NEXT:    v_bfe_i32 v2, v9, 0, 16
4529; GCN-HSA-NEXT:    v_bfe_i32 v0, v8, 0, 16
4530; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
4531; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
4532; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
4533; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4534; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
4535; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
4536; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
4537; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
4538; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 16, v24
4539; GCN-HSA-NEXT:    v_bfe_i32 v12, v24, 0, 16
4540; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v24, 16, v11
4541; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v22, 16, v10
4542; GCN-HSA-NEXT:    v_bfe_i32 v23, v11, 0, 16
4543; GCN-HSA-NEXT:    v_bfe_i32 v21, v10, 0, 16
4544; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4545; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[21:24]
4546; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
4547; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
4548; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
4549; GCN-HSA-NEXT:    s_waitcnt vmcnt(12)
4550; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
4551; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
4552; GCN-HSA-NEXT:    v_bfe_i32 v2, v19, 0, 16
4553; GCN-HSA-NEXT:    v_bfe_i32 v0, v18, 0, 16
4554; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v20, 16, v17
4555; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v18, 16, v16
4556; GCN-HSA-NEXT:    v_bfe_i32 v19, v17, 0, 16
4557; GCN-HSA-NEXT:    v_bfe_i32 v17, v16, 0, 16
4558; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4559; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[17:20]
4560; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s3
4561; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s2
4562; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
4563; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
4564; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
4565; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
4566; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 16, v25
4567; GCN-HSA-NEXT:    v_bfe_i32 v14, v25, 0, 16
4568; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
4569; GCN-HSA-NEXT:    s_add_u32 s0, s0, 48
4570; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
4571; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
4572; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
4573; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 16, v27
4574; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 16, v26
4575; GCN-HSA-NEXT:    v_bfe_i32 v6, v27, 0, 16
4576; GCN-HSA-NEXT:    v_bfe_i32 v4, v26, 0, 16
4577; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
4578; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
4579; GCN-HSA-NEXT:    s_endpgm
4580;
4581; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32:
4582; GCN-NOHSA-VI:       ; %bb.0:
4583; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
4584; GCN-NOHSA-VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
4585; GCN-NOHSA-VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
4586; GCN-NOHSA-VI-NEXT:    s_mov_b32 s90, -1
4587; GCN-NOHSA-VI-NEXT:    s_mov_b32 s91, 0xe80000
4588; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
4589; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
4590; GCN-NOHSA-VI-NEXT:    s_add_u32 s88, s88, s11
4591; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
4592; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
4593; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
4594; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
4595; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
4596; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
4597; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
4598; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
4599; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
4600; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[23:26], off, s[8:11], 0 offset:64
4601; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[27:30], off, s[8:11], 0 offset:80
4602; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:96
4603; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:112
4604; GCN-NOHSA-VI-NEXT:    s_addc_u32 s89, s89, 0
4605; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
4606; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
4607; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(7)
4608; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 16, v15
4609; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 16, v14
4610; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v15, 0, 16
4611; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v14, 0, 16
4612; GCN-NOHSA-VI-NEXT:    buffer_store_dword v15, off, s[88:91], 0 ; 4-byte Folded Spill
4613; GCN-NOHSA-VI-NEXT:    buffer_store_dword v16, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
4614; GCN-NOHSA-VI-NEXT:    buffer_store_dword v17, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
4615; GCN-NOHSA-VI-NEXT:    buffer_store_dword v18, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
4616; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 16, v13
4617; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 16, v12
4618; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v13, 0, 16
4619; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v12, 0, 16
4620; GCN-NOHSA-VI-NEXT:    buffer_store_dword v13, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
4621; GCN-NOHSA-VI-NEXT:    buffer_store_dword v14, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
4622; GCN-NOHSA-VI-NEXT:    buffer_store_dword v15, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
4623; GCN-NOHSA-VI-NEXT:    buffer_store_dword v16, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
4624; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(14)
4625; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 16, v11
4626; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v17, 16, v10
4627; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v11, 0, 16
4628; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v16, v10, 0, 16
4629; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v42, 16, v9
4630; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v40, 16, v8
4631; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v41, v9, 0, 16
4632; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v39, v8, 0, 16
4633; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(12)
4634; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
4635; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
4636; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v3, 0, 16
4637; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
4638; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v50, 16, v1
4639; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v48, 16, v0
4640; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v49, v1, 0, 16
4641; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v47, v0, 0, 16
4642; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(8)
4643; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v36
4644; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v35
4645; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v36, 0, 16
4646; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v35, 0, 16
4647; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 16, v7
4648; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 16, v6
4649; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v7, 0, 16
4650; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v6, 0, 16
4651; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v46, 16, v5
4652; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v44, 16, v4
4653; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v45, v5, 0, 16
4654; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v43, v4, 0, 16
4655; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v54, 16, v26
4656; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v52, 16, v25
4657; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v53, v26, 0, 16
4658; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v51, v25, 0, 16
4659; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v58, 16, v24
4660; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v56, 16, v23
4661; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v57, v24, 0, 16
4662; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v55, v23, 0, 16
4663; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 16, v30
4664; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 16, v29
4665; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v30, 0, 16
4666; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v29, 0, 16
4667; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v62, 16, v28
4668; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v60, 16, v27
4669; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v61, v28, 0, 16
4670; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v59, v27, 0, 16
4671; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v30, 16, v34
4672; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v28, 16, v33
4673; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v29, v34, 0, 16
4674; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v27, v33, 0, 16
4675; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v32
4676; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 16, v31
4677; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v32, 0, 16
4678; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v31, 0, 16
4679; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v34, 16, v38
4680; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v32, 16, v37
4681; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v33, v38, 0, 16
4682; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v31, v37, 0, 16
4683; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
4684; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:240
4685; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
4686; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:208
4687; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[59:62], off, s[0:3], 0 offset:160
4688; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176
4689; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[55:58], off, s[0:3], 0 offset:128
4690; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:144
4691; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:96
4692; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
4693; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[43:46], off, s[0:3], 0 offset:64
4694; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
4695; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:32
4696; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
4697; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
4698; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
4699; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
4700; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
4701; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
4702; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4703; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
4704; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
4705; GCN-NOHSA-VI-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
4706; GCN-NOHSA-VI-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
4707; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
4708; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4709; GCN-NOHSA-VI-NEXT:    s_endpgm
4710;
4711; EG-LABEL: global_sextload_v64i16_to_v64i32:
4712; EG:       ; %bb.0:
4713; EG-NEXT:    ALU 18, @38, KC0[CB0:0-32], KC1[]
4714; EG-NEXT:    TEX 7 @22
4715; EG-NEXT:    ALU 75, @57, KC0[CB0:0-32], KC1[]
4716; EG-NEXT:    ALU 71, @133, KC0[CB0:0-32], KC1[]
4717; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T41.X, 0
4718; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T66.X, 0
4719; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T56.X, 0
4720; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T55.X, 0
4721; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T54.X, 0
4722; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T53.X, 0
4723; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T52.X, 0
4724; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T51.X, 0
4725; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T50.X, 0
4726; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T49.X, 0
4727; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T40.X, 0
4728; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T39.X, 0
4729; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T38.X, 0
4730; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T37.X, 0
4731; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T36.X, 0
4732; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T35.X, 1
4733; EG-NEXT:    CF_END
4734; EG-NEXT:    PAD
4735; EG-NEXT:    Fetch clause starting at 22:
4736; EG-NEXT:     VTX_READ_128 T42.XYZW, T41.X, 16, #1
4737; EG-NEXT:     VTX_READ_128 T43.XYZW, T41.X, 32, #1
4738; EG-NEXT:     VTX_READ_128 T44.XYZW, T41.X, 0, #1
4739; EG-NEXT:     VTX_READ_128 T45.XYZW, T41.X, 48, #1
4740; EG-NEXT:     VTX_READ_128 T46.XYZW, T41.X, 64, #1
4741; EG-NEXT:     VTX_READ_128 T47.XYZW, T41.X, 80, #1
4742; EG-NEXT:     VTX_READ_128 T48.XYZW, T41.X, 96, #1
4743; EG-NEXT:     VTX_READ_128 T41.XYZW, T41.X, 112, #1
4744; EG-NEXT:    ALU clause starting at 38:
4745; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4746; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4747; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
4748; EG-NEXT:     LSHR * T36.X, KC0[2].Y, literal.x,
4749; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4750; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4751; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4752; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
4753; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4754; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
4755; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
4756; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4757; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
4758; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
4759; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4760; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
4761; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
4762; EG-NEXT:     MOV * T41.X, KC0[2].Z,
4763; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4764; EG-NEXT:    ALU clause starting at 57:
4765; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4766; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
4767; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
4768; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4769; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
4770; EG-NEXT:     LSHR T50.X, PV.W, literal.x,
4771; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4772; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
4773; EG-NEXT:     LSHR T51.X, PV.W, literal.x,
4774; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4775; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
4776; EG-NEXT:     LSHR T52.X, PV.W, literal.x,
4777; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4778; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
4779; EG-NEXT:     LSHR T53.X, PV.W, literal.x,
4780; EG-NEXT:     LSHR T0.Y, T41.Y, literal.y,
4781; EG-NEXT:     LSHR T0.Z, T41.W, literal.y,
4782; EG-NEXT:     LSHR T0.W, T48.Y, literal.y, BS:VEC_120/SCL_212
4783; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
4784; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4785; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
4786; EG-NEXT:     LSHR T54.X, PS, literal.x,
4787; EG-NEXT:     LSHR T1.Y, T48.W, literal.y,
4788; EG-NEXT:     LSHR T1.Z, T47.Y, literal.y,
4789; EG-NEXT:     LSHR T1.W, T47.W, literal.y, BS:VEC_120/SCL_212
4790; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
4791; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4792; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
4793; EG-NEXT:     LSHR T55.X, PS, literal.x,
4794; EG-NEXT:     LSHR T2.Y, T46.Y, literal.y,
4795; EG-NEXT:     LSHR T2.Z, T46.W, literal.y,
4796; EG-NEXT:     LSHR T2.W, T45.Y, literal.y, BS:VEC_120/SCL_212
4797; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.z,
4798; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4799; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
4800; EG-NEXT:     LSHR T56.X, PS, literal.x,
4801; EG-NEXT:     LSHR T3.Y, T45.W, literal.y,
4802; EG-NEXT:     BFE_INT T57.Z, T44.W, 0.0, literal.y, BS:VEC_120/SCL_212
4803; EG-NEXT:     LSHR T3.W, T43.Y, literal.y,
4804; EG-NEXT:     LSHR * T4.W, T43.W, literal.y,
4805; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4806; EG-NEXT:     BFE_INT T57.X, T44.Z, 0.0, literal.x,
4807; EG-NEXT:     LSHR T4.Y, T42.Y, literal.x,
4808; EG-NEXT:     BFE_INT T58.Z, T44.Y, 0.0, literal.x, BS:VEC_120/SCL_212
4809; EG-NEXT:     LSHR T5.W, T42.W, literal.x,
4810; EG-NEXT:     LSHR * T6.W, T44.W, literal.x,
4811; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4812; EG-NEXT:     BFE_INT T58.X, T44.X, 0.0, literal.x,
4813; EG-NEXT:     LSHR T5.Y, T44.Y, literal.x,
4814; EG-NEXT:     BFE_INT T59.Z, T42.W, 0.0, literal.x,
4815; EG-NEXT:     BFE_INT T57.W, PS, 0.0, literal.x,
4816; EG-NEXT:     LSHR * T6.W, T44.Z, literal.x,
4817; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4818; EG-NEXT:     BFE_INT T59.X, T42.Z, 0.0, literal.x,
4819; EG-NEXT:     BFE_INT T57.Y, PS, 0.0, literal.x,
4820; EG-NEXT:     BFE_INT T44.Z, T42.Y, 0.0, literal.x,
4821; EG-NEXT:     BFE_INT T58.W, PV.Y, 0.0, literal.x,
4822; EG-NEXT:     LSHR * T6.W, T44.X, literal.x,
4823; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4824; EG-NEXT:     BFE_INT T44.X, T42.X, 0.0, literal.x,
4825; EG-NEXT:     BFE_INT T58.Y, PS, 0.0, literal.x,
4826; EG-NEXT:     BFE_INT T60.Z, T43.W, 0.0, literal.x,
4827; EG-NEXT:     BFE_INT T59.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
4828; EG-NEXT:     LSHR * T5.W, T42.Z, literal.x,
4829; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4830; EG-NEXT:     BFE_INT T60.X, T43.Z, 0.0, literal.x,
4831; EG-NEXT:     BFE_INT T59.Y, PS, 0.0, literal.x,
4832; EG-NEXT:     BFE_INT T42.Z, T43.Y, 0.0, literal.x,
4833; EG-NEXT:     BFE_INT T44.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
4834; EG-NEXT:     LSHR * T5.W, T42.X, literal.x,
4835; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4836; EG-NEXT:     BFE_INT T42.X, T43.X, 0.0, literal.x,
4837; EG-NEXT:     BFE_INT T44.Y, PS, 0.0, literal.x,
4838; EG-NEXT:     BFE_INT T61.Z, T45.W, 0.0, literal.x,
4839; EG-NEXT:     BFE_INT * T60.W, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
4840; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4841; EG-NEXT:    ALU clause starting at 133:
4842; EG-NEXT:     LSHR * T4.W, T43.Z, literal.x,
4843; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4844; EG-NEXT:     BFE_INT T61.X, T45.Z, 0.0, literal.x,
4845; EG-NEXT:     BFE_INT T60.Y, PV.W, 0.0, literal.x,
4846; EG-NEXT:     BFE_INT T43.Z, T45.Y, 0.0, literal.x,
4847; EG-NEXT:     BFE_INT T42.W, T3.W, 0.0, literal.x,
4848; EG-NEXT:     LSHR * T3.W, T43.X, literal.x,
4849; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4850; EG-NEXT:     BFE_INT T43.X, T45.X, 0.0, literal.x,
4851; EG-NEXT:     BFE_INT T42.Y, PS, 0.0, literal.x,
4852; EG-NEXT:     BFE_INT T62.Z, T46.W, 0.0, literal.x,
4853; EG-NEXT:     BFE_INT T61.W, T3.Y, 0.0, literal.x,
4854; EG-NEXT:     LSHR * T3.W, T45.Z, literal.x,
4855; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4856; EG-NEXT:     BFE_INT T62.X, T46.Z, 0.0, literal.x,
4857; EG-NEXT:     BFE_INT T61.Y, PS, 0.0, literal.x,
4858; EG-NEXT:     BFE_INT T45.Z, T46.Y, 0.0, literal.x,
4859; EG-NEXT:     BFE_INT T43.W, T2.W, 0.0, literal.x,
4860; EG-NEXT:     LSHR * T2.W, T45.X, literal.x,
4861; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4862; EG-NEXT:     BFE_INT T45.X, T46.X, 0.0, literal.x,
4863; EG-NEXT:     BFE_INT T43.Y, PS, 0.0, literal.x,
4864; EG-NEXT:     BFE_INT T63.Z, T47.W, 0.0, literal.x,
4865; EG-NEXT:     BFE_INT T62.W, T2.Z, 0.0, literal.x,
4866; EG-NEXT:     LSHR * T2.W, T46.Z, literal.x,
4867; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4868; EG-NEXT:     BFE_INT T63.X, T47.Z, 0.0, literal.x,
4869; EG-NEXT:     BFE_INT T62.Y, PS, 0.0, literal.x,
4870; EG-NEXT:     BFE_INT T46.Z, T47.Y, 0.0, literal.x,
4871; EG-NEXT:     BFE_INT T45.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
4872; EG-NEXT:     LSHR * T2.W, T46.X, literal.x,
4873; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4874; EG-NEXT:     BFE_INT T46.X, T47.X, 0.0, literal.x,
4875; EG-NEXT:     BFE_INT T45.Y, PS, 0.0, literal.x,
4876; EG-NEXT:     BFE_INT T64.Z, T48.W, 0.0, literal.x,
4877; EG-NEXT:     BFE_INT T63.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
4878; EG-NEXT:     LSHR * T1.W, T47.Z, literal.x,
4879; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4880; EG-NEXT:     BFE_INT T64.X, T48.Z, 0.0, literal.x,
4881; EG-NEXT:     BFE_INT T63.Y, PS, 0.0, literal.x,
4882; EG-NEXT:     BFE_INT T47.Z, T48.Y, 0.0, literal.x,
4883; EG-NEXT:     BFE_INT T46.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
4884; EG-NEXT:     LSHR * T1.W, T47.X, literal.x,
4885; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4886; EG-NEXT:     BFE_INT T47.X, T48.X, 0.0, literal.x,
4887; EG-NEXT:     BFE_INT T46.Y, PS, 0.0, literal.x,
4888; EG-NEXT:     BFE_INT T65.Z, T41.W, 0.0, literal.x,
4889; EG-NEXT:     BFE_INT T64.W, T1.Y, 0.0, literal.x,
4890; EG-NEXT:     LSHR * T1.W, T48.Z, literal.x,
4891; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4892; EG-NEXT:     BFE_INT T65.X, T41.Z, 0.0, literal.x,
4893; EG-NEXT:     BFE_INT T64.Y, PS, 0.0, literal.x,
4894; EG-NEXT:     BFE_INT T48.Z, T41.Y, 0.0, literal.x,
4895; EG-NEXT:     BFE_INT T47.W, T0.W, 0.0, literal.x,
4896; EG-NEXT:     LSHR * T0.W, T48.X, literal.x,
4897; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4898; EG-NEXT:     BFE_INT T48.X, T41.X, 0.0, literal.x,
4899; EG-NEXT:     BFE_INT T47.Y, PS, 0.0, literal.x,
4900; EG-NEXT:     LSHR T1.Z, T41.Z, literal.x,
4901; EG-NEXT:     BFE_INT T65.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
4902; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4903; EG-NEXT:    16(2.242078e-44), 240(3.363116e-43)
4904; EG-NEXT:     LSHR T66.X, PS, literal.x,
4905; EG-NEXT:     BFE_INT T65.Y, PV.Z, 0.0, literal.y,
4906; EG-NEXT:     LSHR T0.Z, T41.X, literal.y,
4907; EG-NEXT:     BFE_INT T48.W, T0.Y, 0.0, literal.y,
4908; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
4909; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4910; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
4911; EG-NEXT:     LSHR T41.X, PS, literal.x,
4912; EG-NEXT:     BFE_INT * T48.Y, PV.Z, 0.0, literal.y,
4913; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4914;
4915; CM-LABEL: global_sextload_v64i16_to_v64i32:
4916; CM:       ; %bb.0:
4917; CM-NEXT:    ALU 0, @40, KC0[CB0:0-32], KC1[]
4918; CM-NEXT:    TEX 1 @24
4919; CM-NEXT:    ALU 15, @41, KC0[CB0:0-32], KC1[]
4920; CM-NEXT:    TEX 5 @28
4921; CM-NEXT:    ALU 82, @57, KC0[CB0:0-32], KC1[]
4922; CM-NEXT:    ALU 72, @140, KC0[CB0:0-32], KC1[]
4923; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
4924; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T36, T35.X
4925; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T64, T56.X
4926; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T55.X
4927; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T63, T54.X
4928; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T53.X
4929; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T62, T52.X
4930; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T51.X
4931; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T61, T50.X
4932; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T49.X
4933; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T60, T48.X
4934; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T47.X
4935; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T59, T46.X
4936; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T40.X
4937; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T58, T39.X
4938; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T57, T38.X
4939; CM-NEXT:    CF_END
4940; CM-NEXT:    PAD
4941; CM-NEXT:    Fetch clause starting at 24:
4942; CM-NEXT:     VTX_READ_128 T36.XYZW, T37.X, 16, #1
4943; CM-NEXT:     VTX_READ_128 T35.XYZW, T37.X, 0, #1
4944; CM-NEXT:    Fetch clause starting at 28:
4945; CM-NEXT:     VTX_READ_128 T41.XYZW, T37.X, 112, #1
4946; CM-NEXT:     VTX_READ_128 T42.XYZW, T37.X, 96, #1
4947; CM-NEXT:     VTX_READ_128 T43.XYZW, T37.X, 80, #1
4948; CM-NEXT:     VTX_READ_128 T44.XYZW, T37.X, 64, #1
4949; CM-NEXT:     VTX_READ_128 T45.XYZW, T37.X, 48, #1
4950; CM-NEXT:     VTX_READ_128 T37.XYZW, T37.X, 32, #1
4951; CM-NEXT:    ALU clause starting at 40:
4952; CM-NEXT:     MOV * T37.X, KC0[2].Z,
4953; CM-NEXT:    ALU clause starting at 41:
4954; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4955; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
4956; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
4957; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4958; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
4959; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
4960; CM-NEXT:     LSHR T0.Y, T35.Z, literal.y,
4961; CM-NEXT:     LSHR T0.Z, T35.W, literal.y,
4962; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
4963; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4964; CM-NEXT:    192(2.690493e-43), 0(0.000000e+00)
4965; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
4966; CM-NEXT:     LSHR T1.Y, T35.Y, literal.y,
4967; CM-NEXT:     LSHR T1.Z, T36.Z, literal.y,
4968; CM-NEXT:     LSHR * T0.W, T36.W, literal.y,
4969; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4970; CM-NEXT:    ALU clause starting at 57:
4971; CM-NEXT:     LSHR T2.Z, T36.X, literal.x,
4972; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
4973; CM-NEXT:    16(2.242078e-44), 208(2.914701e-43)
4974; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
4975; CM-NEXT:     LSHR T2.Y, T36.Y, literal.y,
4976; CM-NEXT:     LSHR T3.Z, T37.Z, literal.y,
4977; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
4978; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4979; CM-NEXT:    160(2.242078e-43), 0(0.000000e+00)
4980; CM-NEXT:     LSHR T47.X, PV.W, literal.x,
4981; CM-NEXT:     LSHR T3.Y, T37.W, literal.y,
4982; CM-NEXT:     LSHR T4.Z, T37.X, literal.y,
4983; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
4984; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4985; CM-NEXT:    176(2.466285e-43), 0(0.000000e+00)
4986; CM-NEXT:     LSHR T48.X, PV.W, literal.x,
4987; CM-NEXT:     LSHR T4.Y, T37.Y, literal.y,
4988; CM-NEXT:     LSHR T5.Z, T45.Z, literal.y,
4989; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
4990; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4991; CM-NEXT:    128(1.793662e-43), 0(0.000000e+00)
4992; CM-NEXT:     LSHR T49.X, PV.W, literal.x,
4993; CM-NEXT:     LSHR T5.Y, T45.W, literal.y,
4994; CM-NEXT:     LSHR T6.Z, T45.X, literal.y,
4995; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
4996; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4997; CM-NEXT:    144(2.017870e-43), 0(0.000000e+00)
4998; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
4999; CM-NEXT:     LSHR T6.Y, T45.Y, literal.y,
5000; CM-NEXT:     LSHR T7.Z, T44.Z, literal.y,
5001; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
5002; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5003; CM-NEXT:    96(1.345247e-43), 0(0.000000e+00)
5004; CM-NEXT:     LSHR T51.X, PV.W, literal.x,
5005; CM-NEXT:     LSHR T7.Y, T44.W, literal.y,
5006; CM-NEXT:     LSHR T8.Z, T44.X, literal.y,
5007; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
5008; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5009; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
5010; CM-NEXT:     LSHR T52.X, PV.W, literal.x,
5011; CM-NEXT:     LSHR T8.Y, T44.Y, literal.y,
5012; CM-NEXT:     LSHR T9.Z, T43.Z, literal.y,
5013; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
5014; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5015; CM-NEXT:    64(8.968310e-44), 0(0.000000e+00)
5016; CM-NEXT:     LSHR T53.X, PV.W, literal.x,
5017; CM-NEXT:     LSHR T9.Y, T43.W, literal.y,
5018; CM-NEXT:     LSHR T10.Z, T43.X, literal.y,
5019; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
5020; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5021; CM-NEXT:    80(1.121039e-43), 0(0.000000e+00)
5022; CM-NEXT:     LSHR T54.X, PV.W, literal.x,
5023; CM-NEXT:     LSHR T10.Y, T43.Y, literal.y,
5024; CM-NEXT:     LSHR T11.Z, T42.Z, literal.y,
5025; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
5026; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5027; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
5028; CM-NEXT:     LSHR T55.X, PV.W, literal.x,
5029; CM-NEXT:     LSHR T11.Y, T42.W, literal.y,
5030; CM-NEXT:     LSHR T12.Z, T42.X, literal.y,
5031; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.z,
5032; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5033; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
5034; CM-NEXT:     LSHR T56.X, PV.W, literal.x,
5035; CM-NEXT:     LSHR T12.Y, T42.Y, literal.y,
5036; CM-NEXT:     BFE_INT T57.Z, T41.Y, 0.0, literal.y, BS:VEC_120/SCL_212
5037; CM-NEXT:     LSHR * T1.W, T41.Z, literal.y,
5038; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5039; CM-NEXT:     BFE_INT T57.X, T41.X, 0.0, literal.x,
5040; CM-NEXT:     LSHR T13.Y, T41.W, literal.x,
5041; CM-NEXT:     BFE_INT T58.Z, T41.W, 0.0, literal.x,
5042; CM-NEXT:     LSHR * T2.W, T41.Y, literal.x,
5043; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5044; CM-NEXT:     BFE_INT T58.X, T41.Z, 0.0, literal.x,
5045; CM-NEXT:     LSHR T14.Y, T41.X, literal.x,
5046; CM-NEXT:     BFE_INT T41.Z, T42.Y, 0.0, literal.x,
5047; CM-NEXT:     BFE_INT * T57.W, PV.W, 0.0, literal.x,
5048; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5049; CM-NEXT:     BFE_INT T41.X, T42.X, 0.0, literal.x,
5050; CM-NEXT:     BFE_INT T57.Y, PV.Y, 0.0, literal.x,
5051; CM-NEXT:     BFE_INT T59.Z, T42.W, 0.0, literal.x,
5052; CM-NEXT:     BFE_INT * T58.W, T13.Y, 0.0, literal.x,
5053; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5054; CM-NEXT:    ALU clause starting at 140:
5055; CM-NEXT:     BFE_INT T59.X, T42.Z, 0.0, literal.x,
5056; CM-NEXT:     BFE_INT T58.Y, T1.W, 0.0, literal.x,
5057; CM-NEXT:     BFE_INT T42.Z, T43.Y, 0.0, literal.x,
5058; CM-NEXT:     BFE_INT * T41.W, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
5059; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5060; CM-NEXT:     BFE_INT T42.X, T43.X, 0.0, literal.x,
5061; CM-NEXT:     BFE_INT T41.Y, T12.Z, 0.0, literal.x,
5062; CM-NEXT:     BFE_INT T60.Z, T43.W, 0.0, literal.x,
5063; CM-NEXT:     BFE_INT * T59.W, T11.Y, 0.0, literal.x,
5064; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5065; CM-NEXT:     BFE_INT T60.X, T43.Z, 0.0, literal.x,
5066; CM-NEXT:     BFE_INT T59.Y, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
5067; CM-NEXT:     BFE_INT T43.Z, T44.Y, 0.0, literal.x,
5068; CM-NEXT:     BFE_INT * T42.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
5069; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5070; CM-NEXT:     BFE_INT T43.X, T44.X, 0.0, literal.x,
5071; CM-NEXT:     BFE_INT T42.Y, T10.Z, 0.0, literal.x,
5072; CM-NEXT:     BFE_INT T61.Z, T44.W, 0.0, literal.x,
5073; CM-NEXT:     BFE_INT * T60.W, T9.Y, 0.0, literal.x,
5074; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5075; CM-NEXT:     BFE_INT T61.X, T44.Z, 0.0, literal.x,
5076; CM-NEXT:     BFE_INT T60.Y, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
5077; CM-NEXT:     BFE_INT T44.Z, T45.Y, 0.0, literal.x,
5078; CM-NEXT:     BFE_INT * T43.W, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
5079; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5080; CM-NEXT:     BFE_INT T44.X, T45.X, 0.0, literal.x,
5081; CM-NEXT:     BFE_INT T43.Y, T8.Z, 0.0, literal.x,
5082; CM-NEXT:     BFE_INT T62.Z, T45.W, 0.0, literal.x,
5083; CM-NEXT:     BFE_INT * T61.W, T7.Y, 0.0, literal.x,
5084; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5085; CM-NEXT:     BFE_INT T62.X, T45.Z, 0.0, literal.x,
5086; CM-NEXT:     BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
5087; CM-NEXT:     BFE_INT T45.Z, T37.Y, 0.0, literal.x,
5088; CM-NEXT:     BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
5089; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5090; CM-NEXT:     BFE_INT T45.X, T37.X, 0.0, literal.x,
5091; CM-NEXT:     BFE_INT T44.Y, T6.Z, 0.0, literal.x,
5092; CM-NEXT:     BFE_INT T63.Z, T37.W, 0.0, literal.x,
5093; CM-NEXT:     BFE_INT * T62.W, T5.Y, 0.0, literal.x,
5094; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5095; CM-NEXT:     BFE_INT T63.X, T37.Z, 0.0, literal.x,
5096; CM-NEXT:     BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
5097; CM-NEXT:     BFE_INT T37.Z, T36.Y, 0.0, literal.x,
5098; CM-NEXT:     BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
5099; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5100; CM-NEXT:     BFE_INT T37.X, T36.X, 0.0, literal.x,
5101; CM-NEXT:     BFE_INT T45.Y, T4.Z, 0.0, literal.x,
5102; CM-NEXT:     BFE_INT T64.Z, T36.W, 0.0, literal.x,
5103; CM-NEXT:     BFE_INT * T63.W, T3.Y, 0.0, literal.x,
5104; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5105; CM-NEXT:     BFE_INT T64.X, T36.Z, 0.0, literal.x,
5106; CM-NEXT:     BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
5107; CM-NEXT:     BFE_INT T36.Z, T35.Y, 0.0, literal.x,
5108; CM-NEXT:     BFE_INT * T37.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
5109; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5110; CM-NEXT:     BFE_INT T36.X, T35.X, 0.0, literal.x,
5111; CM-NEXT:     BFE_INT T37.Y, T2.Z, 0.0, literal.x,
5112; CM-NEXT:     BFE_INT T65.Z, T35.W, 0.0, literal.x,
5113; CM-NEXT:     BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
5114; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5115; CM-NEXT:     BFE_INT T65.X, T35.Z, 0.0, literal.x,
5116; CM-NEXT:     BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
5117; CM-NEXT:     LSHR T1.Z, T35.X, literal.x,
5118; CM-NEXT:     BFE_INT * T36.W, T1.Y, 0.0, literal.x,
5119; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5120; CM-NEXT:     LSHR T35.X, KC0[2].Y, literal.x,
5121; CM-NEXT:     BFE_INT T36.Y, PV.Z, 0.0, literal.y,
5122; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.y,
5123; CM-NEXT:     BFE_INT * T65.W, T0.Z, 0.0, literal.y,
5124; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5125; CM-NEXT:     LSHR T66.X, PV.Z, literal.x,
5126; CM-NEXT:     BFE_INT * T65.Y, T0.Y, 0.0, literal.y,
5127; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5128  %load = load <64 x i16>, ptr addrspace(1) %in
5129  %ext = sext <64 x i16> %load to <64 x i32>
5130  store <64 x i32> %ext, ptr addrspace(1) %out
5131  ret void
5132}
5133
; Scalar zero-extending load. Reads one i16 from global memory (%in), widens
; it to i64 with zext, and stores the 64-bit result to %out. In the expected
; output the upper dword is a materialized zero (v_mov_b32 v1, 0 on the GCN
; runs, MOV T0.Y, 0.0 on the r600 runs). The VI run additionally masks the
; loaded value with 0xffff before the store.
5134define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5135; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64:
5136; GCN-NOHSA-SI:       ; %bb.0:
5137; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5138; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5139; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5140; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5141; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5142; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5143; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5144; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5145; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
5146; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5147; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5148; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
5149; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5150; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5151; GCN-NOHSA-SI-NEXT:    s_endpgm
5152;
5153; GCN-HSA-LABEL: global_zextload_i16_to_i64:
5154; GCN-HSA:       ; %bb.0:
5155; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5156; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5157; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5158; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5159; GCN-HSA-NEXT:    flat_load_ushort v0, v[0:1]
5160; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
5161; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
5162; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
5163; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5164; GCN-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5165; GCN-HSA-NEXT:    s_endpgm
5166;
5167; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64:
5168; GCN-NOHSA-VI:       ; %bb.0:
5169; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5170; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5171; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5172; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5173; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5174; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5175; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5176; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5177; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
5178; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5179; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5180; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
5181; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5182; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5183; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5184; GCN-NOHSA-VI-NEXT:    s_endpgm
5185;
5186; EG-LABEL: global_zextload_i16_to_i64:
5187; EG:       ; %bb.0:
5188; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5189; EG-NEXT:    TEX 0 @6
5190; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5191; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5192; EG-NEXT:    CF_END
5193; EG-NEXT:    PAD
5194; EG-NEXT:    Fetch clause starting at 6:
5195; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5196; EG-NEXT:    ALU clause starting at 8:
5197; EG-NEXT:     MOV * T0.X, KC0[2].Z,
5198; EG-NEXT:    ALU clause starting at 9:
5199; EG-NEXT:     MOV * T0.Y, 0.0,
5200; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5201; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5202;
5203; CM-LABEL: global_zextload_i16_to_i64:
5204; CM:       ; %bb.0:
5205; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5206; CM-NEXT:    TEX 0 @6
5207; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5208; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5209; CM-NEXT:    CF_END
5210; CM-NEXT:    PAD
5211; CM-NEXT:    Fetch clause starting at 6:
5212; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5213; CM-NEXT:    ALU clause starting at 8:
5214; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5215; CM-NEXT:    ALU clause starting at 9:
5216; CM-NEXT:     MOV * T0.Y, 0.0,
5217; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5218; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test follows. The check lines above are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated rather than
; edited by hand.
5219  %a = load i16, ptr addrspace(1) %in
5220  %ext = zext i16 %a to i64
5221  store i64 %ext, ptr addrspace(1) %out
5222  ret void
5223}
5224
5225; FIXME: Need to optimize this sequence to avoid extra bfe:
5226;  t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
5227;          t31: i64 = any_extend t28
5228;        t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
5229
5230; TODO: These could be expanded earlier using ASHR 15
; Scalar sign-extending load. Reads one i16 from %in, widens it to i64 with
; sext, and stores the result to %out. Every run forms the upper dword with an
; arithmetic shift right by 31 of the sign-extended low dword. The SI and HSA
; runs get the low dword from a signed 16-bit load (buffer_load_sshort and
; flat_load_sshort), while the VI run loads unsigned and then applies
; v_bfe_i32, which is presumably the extra bfe the FIXME above refers to.
5231define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5232; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64:
5233; GCN-NOHSA-SI:       ; %bb.0:
5234; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5235; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5236; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5237; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5238; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5239; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5240; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5241; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5242; GCN-NOHSA-SI-NEXT:    buffer_load_sshort v0, off, s[8:11], 0
5243; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5244; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5245; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5246; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5247; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5248; GCN-NOHSA-SI-NEXT:    s_endpgm
5249;
5250; GCN-HSA-LABEL: global_sextload_i16_to_i64:
5251; GCN-HSA:       ; %bb.0:
5252; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5253; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5254; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5255; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5256; GCN-HSA-NEXT:    flat_load_sshort v0, v[0:1]
5257; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
5258; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
5259; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5260; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5261; GCN-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5262; GCN-HSA-NEXT:    s_endpgm
5263;
5264; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64:
5265; GCN-NOHSA-VI:       ; %bb.0:
5266; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5267; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5268; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5269; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5270; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5271; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5272; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5273; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5274; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
5275; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5276; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5277; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5278; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
5279; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5280; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5281; GCN-NOHSA-VI-NEXT:    s_endpgm
5282;
5283; EG-LABEL: global_sextload_i16_to_i64:
5284; EG:       ; %bb.0:
5285; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5286; EG-NEXT:    TEX 0 @6
5287; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
5288; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5289; EG-NEXT:    CF_END
5290; EG-NEXT:    PAD
5291; EG-NEXT:    Fetch clause starting at 6:
5292; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5293; EG-NEXT:    ALU clause starting at 8:
5294; EG-NEXT:     MOV * T0.X, KC0[2].Z,
5295; EG-NEXT:    ALU clause starting at 9:
5296; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
5297; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
5298; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
5299; EG-NEXT:     ASHR * T0.Y, PV.X, literal.x,
5300; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5301;
5302; CM-LABEL: global_sextload_i16_to_i64:
5303; CM:       ; %bb.0:
5304; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5305; CM-NEXT:    TEX 0 @6
5306; CM-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
5307; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5308; CM-NEXT:    CF_END
5309; CM-NEXT:    PAD
5310; CM-NEXT:    Fetch clause starting at 6:
5311; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5312; CM-NEXT:    ALU clause starting at 8:
5313; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5314; CM-NEXT:    ALU clause starting at 9:
5315; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
5316; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5317; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
5318; CM-NEXT:     ASHR * T0.Y, PV.X, literal.y,
5319; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; IR under test follows. The check lines above are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated rather than
; edited by hand.
5320  %a = load i16, ptr addrspace(1) %in
5321  %ext = sext i16 %a to i64
5322  store i64 %ext, ptr addrspace(1) %out
5323  ret void
5324}
5325
; Single-element vector variant of the zero-extending load. Reads <1 x i16>
; from %in, zero-extends it to <1 x i64>, and stores it to %out. On every run
; the expected instruction sequence is the same as for the scalar
; global_zextload_i16_to_i64 test.
5326define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5327; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64:
5328; GCN-NOHSA-SI:       ; %bb.0:
5329; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5330; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5331; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5332; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5333; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5334; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5335; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5336; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5337; GCN-NOHSA-SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
5338; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5339; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5340; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
5341; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5342; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5343; GCN-NOHSA-SI-NEXT:    s_endpgm
5344;
5345; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
5346; GCN-HSA:       ; %bb.0:
5347; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5348; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5349; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5350; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5351; GCN-HSA-NEXT:    flat_load_ushort v0, v[0:1]
5352; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
5353; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
5354; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
5355; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5356; GCN-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5357; GCN-HSA-NEXT:    s_endpgm
5358;
5359; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64:
5360; GCN-NOHSA-VI:       ; %bb.0:
5361; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5362; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5363; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5364; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5365; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5366; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5367; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5368; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5369; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
5370; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5371; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5372; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
5373; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5374; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5375; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5376; GCN-NOHSA-VI-NEXT:    s_endpgm
5377;
5378; EG-LABEL: global_zextload_v1i16_to_v1i64:
5379; EG:       ; %bb.0:
5380; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5381; EG-NEXT:    TEX 0 @6
5382; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5383; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5384; EG-NEXT:    CF_END
5385; EG-NEXT:    PAD
5386; EG-NEXT:    Fetch clause starting at 6:
5387; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5388; EG-NEXT:    ALU clause starting at 8:
5389; EG-NEXT:     MOV * T0.X, KC0[2].Z,
5390; EG-NEXT:    ALU clause starting at 9:
5391; EG-NEXT:     MOV * T0.Y, 0.0,
5392; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5393; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5394;
5395; CM-LABEL: global_zextload_v1i16_to_v1i64:
5396; CM:       ; %bb.0:
5397; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5398; CM-NEXT:    TEX 0 @6
5399; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5400; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5401; CM-NEXT:    CF_END
5402; CM-NEXT:    PAD
5403; CM-NEXT:    Fetch clause starting at 6:
5404; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5405; CM-NEXT:    ALU clause starting at 8:
5406; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5407; CM-NEXT:    ALU clause starting at 9:
5408; CM-NEXT:     MOV * T0.Y, 0.0,
5409; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5410; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test follows. The check lines above are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated rather than
; edited by hand.
5411  %load = load <1 x i16>, ptr addrspace(1) %in
5412  %ext = zext <1 x i16> %load to <1 x i64>
5413  store <1 x i64> %ext, ptr addrspace(1) %out
5414  ret void
5415}
5416
5417; TODO: These could be expanded earlier using ASHR 15
; Single-element vector variant of the sign-extending load. Reads <1 x i16>
; from %in, sign-extends it to <1 x i64>, and stores it to %out. On every run
; the expected instruction sequence is the same as for the scalar
; global_sextload_i16_to_i64 test, including the unsigned load followed by
; v_bfe_i32 on the VI run.
5418define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5419; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64:
5420; GCN-NOHSA-SI:       ; %bb.0:
5421; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5422; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5423; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5424; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5425; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5426; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5427; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5428; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5429; GCN-NOHSA-SI-NEXT:    buffer_load_sshort v0, off, s[8:11], 0
5430; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5431; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5432; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5433; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5434; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5435; GCN-NOHSA-SI-NEXT:    s_endpgm
5436;
5437; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
5438; GCN-HSA:       ; %bb.0:
5439; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5440; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5441; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5442; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5443; GCN-HSA-NEXT:    flat_load_sshort v0, v[0:1]
5444; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
5445; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
5446; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5447; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5448; GCN-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5449; GCN-HSA-NEXT:    s_endpgm
5450;
5451; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64:
5452; GCN-NOHSA-VI:       ; %bb.0:
5453; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5454; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5455; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5456; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5457; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5458; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5459; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5460; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5461; GCN-NOHSA-VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
5462; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5463; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5464; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5465; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
5466; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5467; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5468; GCN-NOHSA-VI-NEXT:    s_endpgm
5469;
5470; EG-LABEL: global_sextload_v1i16_to_v1i64:
5471; EG:       ; %bb.0:
5472; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5473; EG-NEXT:    TEX 0 @6
5474; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
5475; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5476; EG-NEXT:    CF_END
5477; EG-NEXT:    PAD
5478; EG-NEXT:    Fetch clause starting at 6:
5479; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5480; EG-NEXT:    ALU clause starting at 8:
5481; EG-NEXT:     MOV * T0.X, KC0[2].Z,
5482; EG-NEXT:    ALU clause starting at 9:
5483; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
5484; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
5485; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
5486; EG-NEXT:     ASHR * T0.Y, PV.X, literal.x,
5487; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5488;
5489; CM-LABEL: global_sextload_v1i16_to_v1i64:
5490; CM:       ; %bb.0:
5491; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5492; CM-NEXT:    TEX 0 @6
5493; CM-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
5494; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5495; CM-NEXT:    CF_END
5496; CM-NEXT:    PAD
5497; CM-NEXT:    Fetch clause starting at 6:
5498; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
5499; CM-NEXT:    ALU clause starting at 8:
5500; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5501; CM-NEXT:    ALU clause starting at 9:
5502; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
5503; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5504; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
5505; CM-NEXT:     ASHR * T0.Y, PV.X, literal.y,
5506; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; IR under test follows. The check lines above are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated rather than
; edited by hand.
5507  %load = load <1 x i16>, ptr addrspace(1) %in
5508  %ext = sext <1 x i16> %load to <1 x i64>
5509  store <1 x i64> %ext, ptr addrspace(1) %out
5510  ret void
5511}
5512
; Two-element zero-extending load. Reads <2 x i16> from %in, zero-extends it
; to <2 x i64>, and stores it to %out. The expected code fetches both elements
; with a single 32-bit load, then splits them with a logical shift right by 16
; (element 1) and a mask with 0xffff (element 0). Both upper dwords are zero,
; and the GCN runs write the result with one dwordx4 store.
5513define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5514; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64:
5515; GCN-NOHSA-SI:       ; %bb.0:
5516; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5517; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5518; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5519; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5520; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5521; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5522; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5523; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5524; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
5525; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
5526; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5527; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5528; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5529; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
5530; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5531; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
5532; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5533; GCN-NOHSA-SI-NEXT:    s_endpgm
5534;
5535; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
5536; GCN-HSA:       ; %bb.0:
5537; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5538; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5539; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5540; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5541; GCN-HSA-NEXT:    flat_load_dword v0, v[0:1]
5542; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
5543; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
5544; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
5545; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
5546; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5547; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
5548; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5549; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
5550; GCN-HSA-NEXT:    s_endpgm
5551;
5552; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64:
5553; GCN-NOHSA-VI:       ; %bb.0:
5554; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5555; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5556; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5557; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5558; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5559; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5560; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5561; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5562; GCN-NOHSA-VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
5563; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
5564; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5565; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5566; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
5567; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5568; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
5569; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5570; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5571; GCN-NOHSA-VI-NEXT:    s_endpgm
5572;
5573; EG-LABEL: global_zextload_v2i16_to_v2i64:
5574; EG:       ; %bb.0:
5575; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5576; EG-NEXT:    TEX 0 @6
5577; EG-NEXT:    ALU 6, @9, KC0[CB0:0-32], KC1[]
5578; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
5579; EG-NEXT:    CF_END
5580; EG-NEXT:    PAD
5581; EG-NEXT:    Fetch clause starting at 6:
5582; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
5583; EG-NEXT:    ALU clause starting at 8:
5584; EG-NEXT:     MOV * T4.X, KC0[2].Z,
5585; EG-NEXT:    ALU clause starting at 9:
5586; EG-NEXT:     LSHR * T4.Z, T4.X, literal.x,
5587; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5588; EG-NEXT:     AND_INT T4.X, T4.X, literal.x,
5589; EG-NEXT:     MOV T4.Y, 0.0,
5590; EG-NEXT:     MOV T4.W, 0.0,
5591; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
5592; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
5593;
5594; CM-LABEL: global_zextload_v2i16_to_v2i64:
5595; CM:       ; %bb.0:
5596; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5597; CM-NEXT:    TEX 0 @6
5598; CM-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
5599; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
5600; CM-NEXT:    CF_END
5601; CM-NEXT:    PAD
5602; CM-NEXT:    Fetch clause starting at 6:
5603; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
5604; CM-NEXT:    ALU clause starting at 8:
5605; CM-NEXT:     MOV * T4.X, KC0[2].Z,
5606; CM-NEXT:    ALU clause starting at 9:
5607; CM-NEXT:     LSHR * T4.Z, T4.X, literal.x,
5608; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5609; CM-NEXT:     AND_INT T4.X, T4.X, literal.x,
5610; CM-NEXT:     MOV T4.Y, 0.0,
5611; CM-NEXT:     MOV * T4.W, 0.0,
5612; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5613; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
5614; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test follows. The check lines above are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated rather than
; edited by hand.
5615  %load = load <2 x i16>, ptr addrspace(1) %in
5616  %ext = zext <2 x i16> %load to <2 x i64>
5617  store <2 x i64> %ext, ptr addrspace(1) %out
5618  ret void
5619}
5620
; Two-element sign-extending load. Reads <2 x i16> from %in, sign-extends it
; to <2 x i64>, and stores it to %out. The expected code fetches both elements
; with a single 32-bit load. The GCN runs shift the upper half down by 16 and
; apply v_bfe_i32 (offset 0, width 16) to both halves, while the r600 runs use
; ASHR by 16 for element 1 and BFE_INT for element 0. In all runs each upper
; dword comes from an arithmetic shift right by 31.
5621define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5622; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64:
5623; GCN-NOHSA-SI:       ; %bb.0:
5624; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5625; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5626; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5627; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5628; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5629; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5630; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5631; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5632; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
5633; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5634; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5635; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5636; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
5637; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
5638; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5639; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
5640; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
5641; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5642; GCN-NOHSA-SI-NEXT:    s_endpgm
5643;
5644; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
5645; GCN-HSA:       ; %bb.0:
5646; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5647; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5648; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5649; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5650; GCN-HSA-NEXT:    flat_load_dword v0, v[0:1]
5651; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
5652; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
5653; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5654; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
5655; GCN-HSA-NEXT:    v_bfe_i32 v0, v0, 0, 16
5656; GCN-HSA-NEXT:    v_bfe_i32 v2, v2, 0, 16
5657; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5658; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
5659; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
5660; GCN-HSA-NEXT:    s_endpgm
5661;
5662; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64:
5663; GCN-NOHSA-VI:       ; %bb.0:
5664; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5665; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5666; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5667; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5668; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5669; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5670; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5671; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5672; GCN-NOHSA-VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
5673; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5674; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5675; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5676; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
5677; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
5678; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
5679; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5680; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
5681; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5682; GCN-NOHSA-VI-NEXT:    s_endpgm
5683;
5684; EG-LABEL: global_sextload_v2i16_to_v2i64:
5685; EG:       ; %bb.0:
5686; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5687; EG-NEXT:    TEX 0 @6
5688; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
5689; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
5690; EG-NEXT:    CF_END
5691; EG-NEXT:    PAD
5692; EG-NEXT:    Fetch clause starting at 6:
5693; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
5694; EG-NEXT:    ALU clause starting at 8:
5695; EG-NEXT:     MOV * T4.X, KC0[2].Z,
5696; EG-NEXT:    ALU clause starting at 9:
5697; EG-NEXT:     ASHR * T4.W, T4.X, literal.x,
5698; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5699; EG-NEXT:     ASHR * T4.Z, T4.X, literal.x,
5700; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5701; EG-NEXT:     BFE_INT T4.X, T4.X, 0.0, literal.x,
5702; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
5703; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
5704; EG-NEXT:     ASHR * T4.Y, PV.X, literal.x,
5705; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5706;
5707; CM-LABEL: global_sextload_v2i16_to_v2i64:
5708; CM:       ; %bb.0:
5709; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5710; CM-NEXT:    TEX 0 @6
5711; CM-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
5712; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
5713; CM-NEXT:    CF_END
5714; CM-NEXT:    PAD
5715; CM-NEXT:    Fetch clause starting at 6:
5716; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 0, #1
5717; CM-NEXT:    ALU clause starting at 8:
5718; CM-NEXT:     MOV * T4.X, KC0[2].Z,
5719; CM-NEXT:    ALU clause starting at 9:
5720; CM-NEXT:     ASHR * T4.W, T4.X, literal.x,
5721; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5722; CM-NEXT:     ASHR * T4.Z, T4.X, literal.x,
5723; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5724; CM-NEXT:     BFE_INT * T4.X, T4.X, 0.0, literal.x,
5725; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5726; CM-NEXT:     LSHR T5.X, KC0[2].Y, literal.x,
5727; CM-NEXT:     ASHR * T4.Y, PV.X, literal.y,
5728; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; IR under test follows. The check lines above are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated rather than
; edited by hand.
5729  %load = load <2 x i16>, ptr addrspace(1) %in
5730  %ext = sext <2 x i16> %load to <2 x i64>
5731  store <2 x i64> %ext, ptr addrspace(1) %out
5732  ret void
5733}
5734
; Zero-extending vector load test: reads <4 x i16> through the addrspace(1)
; pointer %in, widens every lane to i64 with zext, and stores the resulting
; <4 x i64> through %out.
; The expected output for each RUN configuration is one 64-bit load, lanes
; split with a 0xffff mask and a logical shift right by 16, and the upper
; half of every i64 written as zero.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
5735define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5736; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64:
5737; GCN-NOHSA-SI:       ; %bb.0:
5738; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5739; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5740; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5741; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5742; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5743; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5744; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5745; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5746; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0
5747; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, 0
5748; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v1
5749; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v1
5750; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v1
5751; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5752; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5753; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5754; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
5755; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
5756; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v8
5757; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
5758; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
5759; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
5760; GCN-NOHSA-SI-NEXT:    s_endpgm
5761;
5762; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
5763; GCN-HSA:       ; %bb.0:
5764; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5765; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5766; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5767; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5768; GCN-HSA-NEXT:    flat_load_dwordx2 v[8:9], v[0:1]
5769; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
5770; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
5771; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
5772; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
5773; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
5774; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
5775; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
5776; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v1
5777; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v1
5778; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
5779; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5780; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
5781; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v9
5782; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
5783; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v8
5784; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
5785; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
5786; GCN-HSA-NEXT:    s_endpgm
5787;
5788; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64:
5789; GCN-NOHSA-VI:       ; %bb.0:
5790; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5791; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5792; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5793; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5794; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5795; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5796; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5797; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5798; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[8:9], off, s[8:11], 0
5799; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
5800; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v1
5801; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5802; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5803; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v1
5804; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v1
5805; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5806; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
5807; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v9
5808; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
5809; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v8
5810; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
5811; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
5812; GCN-NOHSA-VI-NEXT:    s_endpgm
5813;
5814; EG-LABEL: global_zextload_v4i16_to_v4i64:
5815; EG:       ; %bb.0:
5816; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5817; EG-NEXT:    TEX 0 @6
5818; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
5819; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T8.X, 0
5820; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 1
5821; EG-NEXT:    CF_END
5822; EG-NEXT:    Fetch clause starting at 6:
5823; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
5824; EG-NEXT:    ALU clause starting at 8:
5825; EG-NEXT:     MOV * T5.X, KC0[2].Z,
5826; EG-NEXT:    ALU clause starting at 9:
5827; EG-NEXT:     LSHR * T6.Z, T5.Y, literal.x,
5828; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5829; EG-NEXT:     AND_INT T6.X, T5.Y, literal.x,
5830; EG-NEXT:     MOV T6.Y, 0.0,
5831; EG-NEXT:     LSHR T5.Z, T5.X, literal.y,
5832; EG-NEXT:     AND_INT * T5.X, T5.X, literal.x,
5833; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
5834; EG-NEXT:     MOV T5.Y, 0.0,
5835; EG-NEXT:     MOV T6.W, 0.0,
5836; EG-NEXT:     MOV * T5.W, 0.0,
5837; EG-NEXT:     LSHR T7.X, KC0[2].Y, literal.x,
5838; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
5839; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5840; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
5841; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5842;
5843; CM-LABEL: global_zextload_v4i16_to_v4i64:
5844; CM:       ; %bb.0:
5845; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5846; CM-NEXT:    TEX 0 @6
5847; CM-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
5848; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T8.X
5849; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
5850; CM-NEXT:    CF_END
5851; CM-NEXT:    Fetch clause starting at 6:
5852; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
5853; CM-NEXT:    ALU clause starting at 8:
5854; CM-NEXT:     MOV * T5.X, KC0[2].Z,
5855; CM-NEXT:    ALU clause starting at 9:
5856; CM-NEXT:     LSHR * T6.Z, T5.X, literal.x,
5857; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5858; CM-NEXT:     AND_INT T6.X, T5.X, literal.x,
5859; CM-NEXT:     MOV T6.Y, 0.0,
5860; CM-NEXT:     LSHR * T5.Z, T5.Y, literal.y,
5861; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
5862; CM-NEXT:     AND_INT T5.X, T5.Y, literal.x,
5863; CM-NEXT:     MOV T5.Y, 0.0,
5864; CM-NEXT:     MOV * T6.W, 0.0,
5865; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5866; CM-NEXT:     MOV * T5.W, 0.0,
5867; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
5868; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5869; CM-NEXT:     LSHR * T7.X, PV.W, literal.x,
5870; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5871; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
5872; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; IR under test: load four i16 lanes, zero-extend to i64, store 32 bytes.
5873  %load = load <4 x i16>, ptr addrspace(1) %in
5874  %ext = zext <4 x i16> %load to <4 x i64>
5875  store <4 x i64> %ext, ptr addrspace(1) %out
5876  ret void
5877}
5878
; Sign-extending vector load test: reads <4 x i16> through the addrspace(1)
; pointer %in, widens every lane to i64 with sext, and stores the resulting
; <4 x i64> through %out.
; The expected output for each RUN configuration is one 64-bit load, with
; each lane sign-extended by a 16-bit signed bitfield extract or an
; arithmetic shift, and the upper half of every i64 produced by an
; arithmetic shift right by 31 (or a 64-bit arithmetic shift by 48).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
5879define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
5880; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64:
5881; GCN-NOHSA-SI:       ; %bb.0:
5882; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
5883; GCN-NOHSA-SI-NEXT:    s_mov_b32 s7, 0xf000
5884; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, -1
5885; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s6
5886; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s7
5887; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
5888; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s2
5889; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s3
5890; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[8:11], 0
5891; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s0
5892; GCN-NOHSA-SI-NEXT:    s_mov_b32 s5, s1
5893; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
5894; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v2
5895; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
5896; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v1, 0, 16
5897; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[6:7], v[1:2], 48
5898; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v3, 0, 16
5899; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5900; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v2, v5, 0, 16
5901; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
5902; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
5903; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
5904; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5905; GCN-NOHSA-SI-NEXT:    s_endpgm
5906;
5907; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
5908; GCN-HSA:       ; %bb.0:
5909; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
5910; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
5911; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
5912; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
5913; GCN-HSA-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
5914; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
5915; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
5916; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s3
5917; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s1
5918; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s2
5919; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s0
5920; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
5921; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v2
5922; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
5923; GCN-HSA-NEXT:    v_ashr_i64 v[6:7], v[1:2], 48
5924; GCN-HSA-NEXT:    v_bfe_i32 v2, v4, 0, 16
5925; GCN-HSA-NEXT:    v_bfe_i32 v4, v3, 0, 16
5926; GCN-HSA-NEXT:    v_bfe_i32 v0, v1, 0, 16
5927; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
5928; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5929; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
5930; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
5931; GCN-HSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
5932; GCN-HSA-NEXT:    s_endpgm
5933;
5934; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
5935; GCN-NOHSA-VI:       ; %bb.0:
5936; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
5937; GCN-NOHSA-VI-NEXT:    s_mov_b32 s7, 0xf000
5938; GCN-NOHSA-VI-NEXT:    s_mov_b32 s6, -1
5939; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s6
5940; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s7
5941; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
5942; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s2
5943; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s3
5944; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[8:11], 0
5945; GCN-NOHSA-VI-NEXT:    s_mov_b32 s4, s0
5946; GCN-NOHSA-VI-NEXT:    s_mov_b32 s5, s1
5947; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
5948; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v2
5949; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
5950; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
5951; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v4, 0, 16
5952; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v5, 0, 16
5953; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
5954; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v3, 0, 16
5955; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
5956; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
5957; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
5958; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
5959; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
5960; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5961; GCN-NOHSA-VI-NEXT:    s_endpgm
5962;
5963; EG-LABEL: global_sextload_v4i16_to_v4i64:
5964; EG:       ; %bb.0:
5965; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5966; EG-NEXT:    TEX 0 @6
5967; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
5968; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
5969; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
5970; EG-NEXT:    CF_END
5971; EG-NEXT:    Fetch clause starting at 6:
5972; EG-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
5973; EG-NEXT:    ALU clause starting at 8:
5974; EG-NEXT:     MOV * T5.X, KC0[2].Z,
5975; EG-NEXT:    ALU clause starting at 9:
5976; EG-NEXT:     ASHR * T5.W, T5.X, literal.x,
5977; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5978; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
5979; EG-NEXT:     ASHR T5.Z, T5.X, literal.y,
5980; EG-NEXT:     ASHR * T7.W, T5.Y, literal.z,
5981; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
5982; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
5983; EG-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
5984; EG-NEXT:     ASHR * T7.Z, T5.Y, literal.x,
5985; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
5986; EG-NEXT:     BFE_INT T7.X, T5.Y, 0.0, literal.x,
5987; EG-NEXT:     ASHR T5.Y, PV.X, literal.y,
5988; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
5989; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
5990; EG-NEXT:     LSHR T8.X, PV.W, literal.x,
5991; EG-NEXT:     ASHR * T7.Y, PV.X, literal.y,
5992; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
5993;
5994; CM-LABEL: global_sextload_v4i16_to_v4i64:
5995; CM:       ; %bb.0:
5996; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
5997; CM-NEXT:    TEX 0 @6
5998; CM-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
5999; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
6000; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
6001; CM-NEXT:    CF_END
6002; CM-NEXT:    Fetch clause starting at 6:
6003; CM-NEXT:     VTX_READ_64 T5.XY, T5.X, 0, #1
6004; CM-NEXT:    ALU clause starting at 8:
6005; CM-NEXT:     MOV * T5.X, KC0[2].Z,
6006; CM-NEXT:    ALU clause starting at 9:
6007; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
6008; CM-NEXT:     ASHR * T6.W, T5.Y, literal.y,
6009; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6010; CM-NEXT:     LSHR T7.X, PV.Z, literal.x,
6011; CM-NEXT:     ASHR T6.Z, T5.Y, literal.y,
6012; CM-NEXT:     ASHR * T5.W, T5.X, literal.z,
6013; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6014; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
6015; CM-NEXT:     BFE_INT T6.X, T5.Y, 0.0, literal.x,
6016; CM-NEXT:     ASHR * T5.Z, T5.X, literal.x,
6017; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6018; CM-NEXT:     BFE_INT T5.X, T5.X, 0.0, literal.x,
6019; CM-NEXT:     ASHR * T6.Y, PV.X, literal.y,
6020; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6021; CM-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
6022; CM-NEXT:     ASHR * T5.Y, PV.X, literal.y,
6023; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  ; IR under test: load four i16 lanes, sign-extend to i64, store 32 bytes.
6024  %load = load <4 x i16>, ptr addrspace(1) %in
6025  %ext = sext <4 x i16> %load to <4 x i64>
6026  store <4 x i64> %ext, ptr addrspace(1) %out
6027  ret void
6028}
6029
; Zero-extending vector load test: reads <8 x i16> through the addrspace(1)
; pointer %in, widens every lane to i64 with zext, and stores the resulting
; <8 x i64> through %out.
; The expected output for each RUN configuration is one 128-bit load, lanes
; split with a 0xffff mask and a logical shift right by 16, the upper half
; of every i64 written as zero, and four 16-byte stores at byte offsets
; 0, 16, 32 and 48 from %out.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
6030define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
6031; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64:
6032; GCN-NOHSA-SI:       ; %bb.0:
6033; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
6034; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
6035; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
6036; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
6037; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
6038; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
6039; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
6040; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
6041; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6042; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, 0
6043; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v4
6044; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, v4
6045; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v4
6046; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v4
6047; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v4
6048; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v4
6049; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v4
6050; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
6051; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
6052; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
6053; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
6054; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
6055; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
6056; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
6057; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v0
6058; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xffff, v2
6059; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v1
6060; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
6061; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48
6062; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
6063; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
6064; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0
6065; GCN-NOHSA-SI-NEXT:    s_endpgm
6066;
6067; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
6068; GCN-HSA:       ; %bb.0:
6069; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
6070; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
6071; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
6072; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
6073; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
6074; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
6075; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
6076; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
6077; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
6078; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
6079; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6080; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
6081; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
6082; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
6083; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
6084; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6085; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
6086; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
6087; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
6088; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
6089; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
6090; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
6091; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v4
6092; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v4
6093; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
6094; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
6095; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
6096; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
6097; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
6098; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
6099; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
6100; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
6101; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
6102; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v0
6103; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v2
6104; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v1
6105; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[3:6]
6106; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[7:10]
6107; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[11:14]
6108; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
6109; GCN-HSA-NEXT:    s_endpgm
6110;
6111; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64:
6112; GCN-NOHSA-VI:       ; %bb.0:
6113; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
6114; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
6115; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
6116; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
6117; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
6118; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
6119; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
6120; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
6121; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6122; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, 0
6123; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, v4
6124; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
6125; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
6126; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v8, v4
6127; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v10, v4
6128; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v12, v4
6129; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v4
6130; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, v4
6131; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v4
6132; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
6133; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
6134; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
6135; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
6136; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v15, 0xffff, v0
6137; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
6138; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v11, 0xffff, v1
6139; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
6140; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v7, 0xffff, v2
6141; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48
6142; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
6143; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
6144; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0
6145; GCN-NOHSA-VI-NEXT:    s_endpgm
6146;
6147; EG-LABEL: global_zextload_v8i16_to_v8i64:
6148; EG:       ; %bb.0:
6149; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
6150; EG-NEXT:    TEX 0 @8
6151; EG-NEXT:    ALU 30, @11, KC0[CB0:0-32], KC1[]
6152; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
6153; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T13.X, 0
6154; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
6155; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 1
6156; EG-NEXT:    CF_END
6157; EG-NEXT:    Fetch clause starting at 8:
6158; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
6159; EG-NEXT:    ALU clause starting at 10:
6160; EG-NEXT:     MOV * T7.X, KC0[2].Z,
6161; EG-NEXT:    ALU clause starting at 11:
6162; EG-NEXT:     LSHR * T8.Z, T7.W, literal.x,
6163; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6164; EG-NEXT:     AND_INT T8.X, T7.W, literal.x,
6165; EG-NEXT:     MOV T8.Y, 0.0,
6166; EG-NEXT:     LSHR T9.Z, T7.Z, literal.y,
6167; EG-NEXT:     AND_INT * T9.X, T7.Z, literal.x,
6168; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6169; EG-NEXT:     MOV T9.Y, 0.0,
6170; EG-NEXT:     LSHR * T10.Z, T7.Y, literal.x,
6171; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6172; EG-NEXT:     AND_INT T10.X, T7.Y, literal.x,
6173; EG-NEXT:     MOV T10.Y, 0.0,
6174; EG-NEXT:     LSHR T7.Z, T7.X, literal.y,
6175; EG-NEXT:     AND_INT * T7.X, T7.X, literal.x,
6176; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6177; EG-NEXT:     MOV T7.Y, 0.0,
6178; EG-NEXT:     MOV T8.W, 0.0,
6179; EG-NEXT:     MOV * T9.W, 0.0,
6180; EG-NEXT:     MOV T10.W, 0.0,
6181; EG-NEXT:     MOV * T7.W, 0.0,
6182; EG-NEXT:     LSHR T11.X, KC0[2].Y, literal.x,
6183; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6184; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6185; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
6186; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6187; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
6188; EG-NEXT:     LSHR T13.X, PV.W, literal.x,
6189; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6190; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
6191; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
6192; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6193;
6194; CM-LABEL: global_zextload_v8i16_to_v8i64:
6195; CM:       ; %bb.0:
6196; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
6197; CM-NEXT:    TEX 0 @8
6198; CM-NEXT:    ALU 32, @11, KC0[CB0:0-32], KC1[]
6199; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8, T14.X
6200; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T9, T13.X
6201; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T12.X
6202; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T11.X
6203; CM-NEXT:    CF_END
6204; CM-NEXT:    Fetch clause starting at 8:
6205; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
6206; CM-NEXT:    ALU clause starting at 10:
6207; CM-NEXT:     MOV * T7.X, KC0[2].Z,
6208; CM-NEXT:    ALU clause starting at 11:
6209; CM-NEXT:     LSHR * T8.Z, T7.X, literal.x,
6210; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6211; CM-NEXT:     AND_INT T8.X, T7.X, literal.x,
6212; CM-NEXT:     MOV T8.Y, 0.0,
6213; CM-NEXT:     LSHR * T9.Z, T7.Y, literal.y,
6214; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6215; CM-NEXT:     AND_INT T9.X, T7.Y, literal.x,
6216; CM-NEXT:     MOV T9.Y, 0.0,
6217; CM-NEXT:     LSHR * T10.Z, T7.Z, literal.y,
6218; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6219; CM-NEXT:     AND_INT T10.X, T7.Z, literal.x,
6220; CM-NEXT:     MOV T10.Y, 0.0,
6221; CM-NEXT:     LSHR * T7.Z, T7.W, literal.y,
6222; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6223; CM-NEXT:     AND_INT T7.X, T7.W, literal.x,
6224; CM-NEXT:     MOV T7.Y, 0.0,
6225; CM-NEXT:     MOV * T8.W, 0.0,
6226; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
6227; CM-NEXT:     MOV * T9.W, 0.0,
6228; CM-NEXT:     MOV * T10.W, 0.0,
6229; CM-NEXT:     MOV * T7.W, 0.0,
6230; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6231; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6232; CM-NEXT:     LSHR T11.X, PV.W, literal.x,
6233; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6234; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
6235; CM-NEXT:     LSHR T12.X, PV.W, literal.x,
6236; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6237; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6238; CM-NEXT:     LSHR * T13.X, PV.W, literal.x,
6239; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6240; CM-NEXT:     LSHR * T14.X, KC0[2].Y, literal.x,
6241; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; IR under test: load eight i16 lanes, zero-extend to i64, store 64 bytes.
6242  %load = load <8 x i16>, ptr addrspace(1) %in
6243  %ext = zext <8 x i16> %load to <8 x i64>
6244  store <8 x i64> %ext, ptr addrspace(1) %out
6245  ret void
6246}
6247
6248define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
6249; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64:
6250; GCN-NOHSA-SI:       ; %bb.0:
6251; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
6252; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
6253; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
6254; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
6255; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
6256; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
6257; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
6258; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
6259; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6260; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
6261; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
6262; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
6263; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v3
6264; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
6265; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
6266; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
6267; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
6268; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[14:15], v[0:1], 48
6269; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
6270; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v1, 0, 16
6271; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
6272; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
6273; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6274; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v6, 0, 16
6275; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v7, 0, 16
6276; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
6277; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
6278; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
6279; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
6280; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
6281; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
6282; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
6283; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
6284; GCN-NOHSA-SI-NEXT:    s_endpgm
6285;
6286; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
6287; GCN-HSA:       ; %bb.0:
6288; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
6289; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
6290; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
6291; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
6292; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
6293; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
6294; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6295; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
6296; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
6297; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
6298; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
6299; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6300; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
6301; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
6302; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
6303; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
6304; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
6305; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s1
6306; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s0
6307; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
6308; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v3
6309; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
6310; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
6311; GCN-HSA-NEXT:    v_ashr_i64 v[14:15], v[0:1], 48
6312; GCN-HSA-NEXT:    v_bfe_i32 v12, v1, 0, 16
6313; GCN-HSA-NEXT:    v_bfe_i32 v4, v0, 0, 16
6314; GCN-HSA-NEXT:    v_bfe_i32 v8, v2, 0, 16
6315; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
6316; GCN-HSA-NEXT:    v_bfe_i32 v6, v6, 0, 16
6317; GCN-HSA-NEXT:    v_bfe_i32 v10, v10, 0, 16
6318; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
6319; GCN-HSA-NEXT:    v_bfe_i32 v0, v7, 0, 16
6320; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
6321; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6322; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
6323; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
6324; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
6325; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
6326; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
6327; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
6328; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
6329; GCN-HSA-NEXT:    s_endpgm
6330;
6331; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64:
6332; GCN-NOHSA-VI:       ; %bb.0:
6333; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
6334; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
6335; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
6336; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
6337; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
6338; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
6339; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
6340; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
6341; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6342; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
6343; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
6344; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
6345; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v3
6346; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
6347; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
6348; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
6349; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
6350; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v11, 0, 16
6351; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v3, 0, 16
6352; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v0, 0, 16
6353; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
6354; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v2, 0, 16
6355; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v6, 0, 16
6356; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v7, 0, 16
6357; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v10, 0, 16
6358; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
6359; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
6360; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
6361; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
6362; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6363; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
6364; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
6365; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
6366; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
6367; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
6368; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
6369; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
6370; GCN-NOHSA-VI-NEXT:    s_endpgm
6371;
6372; EG-LABEL: global_sextload_v8i16_to_v8i64:
6373; EG:       ; %bb.0:
6374; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
6375; EG-NEXT:    TEX 0 @8
6376; EG-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
6377; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
6378; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 0
6379; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T9.X, 0
6380; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 1
6381; EG-NEXT:    CF_END
6382; EG-NEXT:    Fetch clause starting at 8:
6383; EG-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
6384; EG-NEXT:    ALU clause starting at 10:
6385; EG-NEXT:     MOV * T7.X, KC0[2].Z,
6386; EG-NEXT:    ALU clause starting at 11:
6387; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
6388; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6389; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6390; EG-NEXT:     LSHR T9.X, PV.W, literal.x,
6391; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
6392; EG-NEXT:     ASHR * T10.W, T7.X, literal.z,
6393; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
6394; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
6395; EG-NEXT:     LSHR T11.X, PV.W, literal.x,
6396; EG-NEXT:     ASHR T10.Z, T7.X, literal.y,
6397; EG-NEXT:     ASHR * T12.W, T7.Y, literal.z,
6398; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6399; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
6400; EG-NEXT:     BFE_INT T10.X, T7.X, 0.0, literal.x,
6401; EG-NEXT:     ASHR T12.Z, T7.Y, literal.x,
6402; EG-NEXT:     ASHR * T13.W, T7.Z, literal.y,
6403; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6404; EG-NEXT:     BFE_INT T12.X, T7.Y, 0.0, literal.x,
6405; EG-NEXT:     ASHR T10.Y, PV.X, literal.y,
6406; EG-NEXT:     ASHR T13.Z, T7.Z, literal.x,
6407; EG-NEXT:     ASHR * T14.W, T7.W, literal.y,
6408; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6409; EG-NEXT:     BFE_INT T13.X, T7.Z, 0.0, literal.x,
6410; EG-NEXT:     ASHR T12.Y, PV.X, literal.y,
6411; EG-NEXT:     ASHR * T14.Z, T7.W, literal.x,
6412; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6413; EG-NEXT:     BFE_INT T14.X, T7.W, 0.0, literal.x,
6414; EG-NEXT:     ASHR T13.Y, PV.X, literal.y,
6415; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
6416; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6417; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6418; EG-NEXT:     LSHR T7.X, PV.W, literal.x,
6419; EG-NEXT:     ASHR * T14.Y, PV.X, literal.y,
6420; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
6421;
6422; CM-LABEL: global_sextload_v8i16_to_v8i64:
6423; CM:       ; %bb.0:
6424; CM-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
6425; CM-NEXT:    TEX 0 @8
6426; CM-NEXT:    ALU 33, @11, KC0[CB0:0-32], KC1[]
6427; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
6428; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T11.X
6429; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T9.X
6430; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T10, T8.X
6431; CM-NEXT:    CF_END
6432; CM-NEXT:    Fetch clause starting at 8:
6433; CM-NEXT:     VTX_READ_128 T7.XYZW, T7.X, 0, #1
6434; CM-NEXT:    ALU clause starting at 10:
6435; CM-NEXT:     MOV * T7.X, KC0[2].Z,
6436; CM-NEXT:    ALU clause starting at 11:
6437; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6438; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6439; CM-NEXT:     LSHR T8.X, PV.W, literal.x,
6440; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6441; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
6442; CM-NEXT:     LSHR T9.X, PV.W, literal.x,
6443; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
6444; CM-NEXT:     ASHR * T10.W, T7.W, literal.z,
6445; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6446; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
6447; CM-NEXT:     LSHR T11.X, PV.Z, literal.x,
6448; CM-NEXT:     ASHR T10.Z, T7.W, literal.y,
6449; CM-NEXT:     ASHR * T12.W, T7.Z, literal.z,
6450; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6451; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
6452; CM-NEXT:     BFE_INT T10.X, T7.W, 0.0, literal.x,
6453; CM-NEXT:     ASHR T12.Z, T7.Z, literal.x,
6454; CM-NEXT:     ASHR * T13.W, T7.Y, literal.y,
6455; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6456; CM-NEXT:     BFE_INT T12.X, T7.Z, 0.0, literal.x,
6457; CM-NEXT:     ASHR T10.Y, PV.X, literal.y,
6458; CM-NEXT:     ASHR T13.Z, T7.Y, literal.x,
6459; CM-NEXT:     ASHR * T7.W, T7.X, literal.y,
6460; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6461; CM-NEXT:     BFE_INT T13.X, T7.Y, 0.0, literal.x,
6462; CM-NEXT:     ASHR T12.Y, PV.X, literal.y,
6463; CM-NEXT:     ASHR * T7.Z, T7.X, literal.x,
6464; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6465; CM-NEXT:     BFE_INT T7.X, T7.X, 0.0, literal.x,
6466; CM-NEXT:     ASHR * T13.Y, PV.X, literal.y,
6467; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
6468; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
6469; CM-NEXT:     ASHR * T7.Y, PV.X, literal.y,
6470; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
6471  %load = load <8 x i16>, ptr addrspace(1) %in
6472  %ext = sext <8 x i16> %load to <8 x i64>
6473  store <8 x i64> %ext, ptr addrspace(1) %out
6474  ret void
6475}
6476
; Zero-extending load test: reads <16 x i16> from the global (addrspace(1))
; pointer %in, widens every element to i64 with zext, and stores the resulting
; <16 x i64> to %out.
; The per-target assertion blocks that follow are autogenerated (see the NOTE
; on the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; In the GCN expectations below, the low/high 16-bit halves of each loaded
; dword are extracted with a 0xffff mask and a 16-bit logical right shift, and
; the upper dword of every i64 result is a copy of one zero-initialized
; register. The R600 expectations use AND_INT with 65535, LSHR by 16, and
; MOV of 0.0 into the Y/W channels for the same purpose.
6477define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
6478; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64:
6479; GCN-NOHSA-SI:       ; %bb.0:
6480; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
6481; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
6482; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
6483; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
6484; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
6485; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
6486; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
6487; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
6488; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6489; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
6490; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
6491; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
6492; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
6493; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
6494; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v0
6495; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
6496; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
6497; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
6498; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
6499; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, 0
6500; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
6501; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
6502; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
6503; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
6504; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
6505; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
6506; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
6507; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v7
6508; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
6509; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v21
6510; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v21
6511; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v21
6512; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v21
6513; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v21
6514; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v21
6515; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v21
6516; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v21
6517; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v21
6518; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v21
6519; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v21
6520; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v21
6521; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v21
6522; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v21
6523; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v21
6524; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
6525; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
6526; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
6527; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
6528; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
6529; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
6530; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
6531; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
6532; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
6533; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
6534; GCN-NOHSA-SI-NEXT:    s_endpgm
6535;
6536; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
6537; GCN-HSA:       ; %bb.0:
6538; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
6539; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
6540; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
6541; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
6542; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
6543; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
6544; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
6545; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
6546; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
6547; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
6548; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
6549; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
6550; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
6551; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
6552; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
6553; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6554; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
6555; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
6556; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
6557; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
6558; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
6559; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
6560; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
6561; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v8
6562; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v8
6563; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
6564; GCN-HSA-NEXT:    v_mov_b32_e32 v23, v8
6565; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
6566; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
6567; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v1
6568; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[11:14]
6569; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
6570; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
6571; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
6572; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
6573; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
6574; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v5
6575; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
6576; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[14:17]
6577; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
6578; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s5
6579; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s4
6580; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v7
6581; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[17:20]
6582; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
6583; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
6584; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
6585; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
6586; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6587; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
6588; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v3
6589; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
6590; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
6591; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[7:10]
6592; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
6593; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6594; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
6595; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
6596; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
6597; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v8
6598; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v8
6599; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
6600; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v4
6601; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s3
6602; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
6603; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
6604; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v2
6605; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v8
6606; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
6607; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
6608; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v8
6609; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v8
6610; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
6611; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v6
6612; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
6613; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
6614; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v0
6615; GCN-HSA-NEXT:    v_and_b32_e32 v20, 0xffff, v0
6616; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[10:13]
6617; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[20:23]
6618; GCN-HSA-NEXT:    flat_store_dwordx4 v[3:4], v[14:17]
6619; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[6:9]
6620; GCN-HSA-NEXT:    s_endpgm
6621;
6622; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
6623; GCN-NOHSA-VI:       ; %bb.0:
6624; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
6625; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
6626; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
6627; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
6628; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
6629; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
6630; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
6631; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
6632; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6633; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
6634; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, 0
6635; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, v29
6636; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
6637; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
6638; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v29
6639; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, v29
6640; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v29
6641; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, v29
6642; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v29
6643; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, v29
6644; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v29
6645; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, v29
6646; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v29
6647; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v29
6648; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
6649; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
6650; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
6651; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
6652; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v5
6653; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
6654; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
6655; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v1
6656; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v2
6657; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
6658; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
6659; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
6660; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
6661; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
6662; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
6663; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v7
6664; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
6665; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
6666; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v29
6667; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v29
6668; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v29
6669; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v29
6670; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
6671; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
6672; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
6673; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
6674; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
6675; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
6676; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
6677; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
6678; GCN-NOHSA-VI-NEXT:    s_endpgm
6679;
6680; EG-LABEL: global_zextload_v16i16_to_v16i64:
6681; EG:       ; %bb.0:
6682; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
6683; EG-NEXT:    TEX 1 @12
6684; EG-NEXT:    ALU 62, @17, KC0[CB0:0-32], KC1[]
6685; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
6686; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
6687; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
6688; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T23.X, 0
6689; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T22.X, 0
6690; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T21.X, 0
6691; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T20.X, 0
6692; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 1
6693; EG-NEXT:    CF_END
6694; EG-NEXT:    Fetch clause starting at 12:
6695; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
6696; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
6697; EG-NEXT:    ALU clause starting at 16:
6698; EG-NEXT:     MOV * T11.X, KC0[2].Z,
6699; EG-NEXT:    ALU clause starting at 17:
6700; EG-NEXT:     LSHR * T13.Z, T12.W, literal.x,
6701; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6702; EG-NEXT:     AND_INT T13.X, T12.W, literal.x,
6703; EG-NEXT:     MOV T13.Y, 0.0,
6704; EG-NEXT:     LSHR T14.Z, T12.Z, literal.y,
6705; EG-NEXT:     AND_INT * T14.X, T12.Z, literal.x,
6706; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6707; EG-NEXT:     MOV T14.Y, 0.0,
6708; EG-NEXT:     LSHR * T15.Z, T12.Y, literal.x,
6709; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6710; EG-NEXT:     AND_INT T15.X, T12.Y, literal.x,
6711; EG-NEXT:     MOV T15.Y, 0.0,
6712; EG-NEXT:     LSHR T12.Z, T12.X, literal.y,
6713; EG-NEXT:     AND_INT * T12.X, T12.X, literal.x,
6714; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6715; EG-NEXT:     MOV T12.Y, 0.0,
6716; EG-NEXT:     LSHR * T16.Z, T11.W, literal.x,
6717; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6718; EG-NEXT:     AND_INT T16.X, T11.W, literal.x,
6719; EG-NEXT:     MOV T16.Y, 0.0,
6720; EG-NEXT:     LSHR T17.Z, T11.Z, literal.y,
6721; EG-NEXT:     AND_INT * T17.X, T11.Z, literal.x,
6722; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6723; EG-NEXT:     MOV T17.Y, 0.0,
6724; EG-NEXT:     LSHR * T18.Z, T11.Y, literal.x,
6725; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6726; EG-NEXT:     AND_INT T18.X, T11.Y, literal.x,
6727; EG-NEXT:     MOV T18.Y, 0.0,
6728; EG-NEXT:     LSHR T11.Z, T11.X, literal.y,
6729; EG-NEXT:     AND_INT * T11.X, T11.X, literal.x,
6730; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6731; EG-NEXT:     MOV T11.Y, 0.0,
6732; EG-NEXT:     MOV T13.W, 0.0,
6733; EG-NEXT:     MOV * T14.W, 0.0,
6734; EG-NEXT:     MOV T15.W, 0.0,
6735; EG-NEXT:     MOV * T12.W, 0.0,
6736; EG-NEXT:     MOV T16.W, 0.0,
6737; EG-NEXT:     MOV * T17.W, 0.0,
6738; EG-NEXT:     MOV T18.W, 0.0,
6739; EG-NEXT:     MOV * T11.W, 0.0,
6740; EG-NEXT:     LSHR T19.X, KC0[2].Y, literal.x,
6741; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6742; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6743; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
6744; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6745; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
6746; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
6747; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6748; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
6749; EG-NEXT:     LSHR T22.X, PV.W, literal.x,
6750; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6751; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
6752; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
6753; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6754; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
6755; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
6756; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6757; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
6758; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
6759; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6760; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
6761; EG-NEXT:     LSHR * T26.X, PV.W, literal.x,
6762; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6763;
6764; CM-LABEL: global_zextload_v16i16_to_v16i64:
6765; CM:       ; %bb.0:
6766; CM-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
6767; CM-NEXT:    TEX 1 @12
6768; CM-NEXT:    ALU 64, @17, KC0[CB0:0-32], KC1[]
6769; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T13, T26.X
6770; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T14, T25.X
6771; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T15, T24.X
6772; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T23.X
6773; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T16, T22.X
6774; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T17, T21.X
6775; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T18, T20.X
6776; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T19.X
6777; CM-NEXT:    CF_END
6778; CM-NEXT:    Fetch clause starting at 12:
6779; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
6780; CM-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 16, #1
6781; CM-NEXT:    ALU clause starting at 16:
6782; CM-NEXT:     MOV * T11.X, KC0[2].Z,
6783; CM-NEXT:    ALU clause starting at 17:
6784; CM-NEXT:     LSHR * T13.Z, T12.X, literal.x,
6785; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6786; CM-NEXT:     AND_INT T13.X, T12.X, literal.x,
6787; CM-NEXT:     MOV T13.Y, 0.0,
6788; CM-NEXT:     LSHR * T14.Z, T12.Y, literal.y,
6789; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6790; CM-NEXT:     AND_INT T14.X, T12.Y, literal.x,
6791; CM-NEXT:     MOV T14.Y, 0.0,
6792; CM-NEXT:     LSHR * T15.Z, T12.Z, literal.y,
6793; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6794; CM-NEXT:     AND_INT T15.X, T12.Z, literal.x,
6795; CM-NEXT:     MOV T15.Y, 0.0,
6796; CM-NEXT:     LSHR * T12.Z, T12.W, literal.y,
6797; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6798; CM-NEXT:     AND_INT T12.X, T12.W, literal.x,
6799; CM-NEXT:     MOV T12.Y, 0.0,
6800; CM-NEXT:     LSHR * T16.Z, T11.X, literal.y,
6801; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6802; CM-NEXT:     AND_INT T16.X, T11.X, literal.x,
6803; CM-NEXT:     MOV T16.Y, 0.0,
6804; CM-NEXT:     LSHR * T17.Z, T11.Y, literal.y,
6805; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6806; CM-NEXT:     AND_INT T17.X, T11.Y, literal.x,
6807; CM-NEXT:     MOV T17.Y, 0.0,
6808; CM-NEXT:     LSHR * T18.Z, T11.Z, literal.y,
6809; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6810; CM-NEXT:     AND_INT T18.X, T11.Z, literal.x,
6811; CM-NEXT:     MOV T18.Y, 0.0,
6812; CM-NEXT:     LSHR * T11.Z, T11.W, literal.y,
6813; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
6814; CM-NEXT:     AND_INT T11.X, T11.W, literal.x,
6815; CM-NEXT:     MOV T11.Y, 0.0,
6816; CM-NEXT:     MOV * T13.W, 0.0,
6817; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
6818; CM-NEXT:     MOV * T14.W, 0.0,
6819; CM-NEXT:     MOV * T15.W, 0.0,
6820; CM-NEXT:     MOV * T12.W, 0.0,
6821; CM-NEXT:     MOV * T16.W, 0.0,
6822; CM-NEXT:     MOV * T17.W, 0.0,
6823; CM-NEXT:     MOV * T18.W, 0.0,
6824; CM-NEXT:     MOV * T11.W, 0.0,
6825; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6826; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
6827; CM-NEXT:     LSHR T19.X, PV.W, literal.x,
6828; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6829; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
6830; CM-NEXT:     LSHR T20.X, PV.W, literal.x,
6831; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6832; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
6833; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
6834; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6835; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
6836; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
6837; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6838; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
6839; CM-NEXT:     LSHR T23.X, PV.W, literal.x,
6840; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6841; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
6842; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
6843; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
6844; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
6845; CM-NEXT:     LSHR * T25.X, PV.W, literal.x,
6846; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6847; CM-NEXT:     LSHR * T26.X, KC0[2].Y, literal.x,
6848; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: one vector load, a zext to 64-bit lanes, and one vector store.
6849  %load = load <16 x i16>, ptr addrspace(1) %in
6850  %ext = zext <16 x i16> %load to <16 x i64>
6851  store <16 x i64> %ext, ptr addrspace(1) %out
6852  ret void
6853}
6854
6855define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
6856; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64:
6857; GCN-NOHSA-SI:       ; %bb.0:
6858; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
6859; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
6860; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
6861; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
6862; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
6863; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
6864; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
6865; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
6866; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
6867; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
6868; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
6869; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
6870; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
6871; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v7
6872; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v3
6873; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
6874; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
6875; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
6876; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v8, v0, 0, 16
6877; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v12, v2, 0, 16
6878; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[18:19], v[0:1], 48
6879; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v16, v1, 0, 16
6880; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
6881; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v0, v10, 0, 16
6882; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v20, v4, 0, 16
6883; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v10, v14, 0, 16
6884; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v14, v13, 0, 16
6885; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v11, 0, 16
6886; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v9, 0, 16
6887; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[25:26], v[6:7], 48
6888; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
6889; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
6890; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
6891; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[26:27], v[4:5], 48
6892; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v24, v5, 0, 16
6893; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
6894; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v4, v6, 0, 16
6895; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
6896; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6897; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
6898; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
6899; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
6900; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
6901; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
6902; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
6903; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
6904; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
6905; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
6906; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
6907; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
6908; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
6909; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
6910; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
6911; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
6912; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
6913; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
6914; GCN-NOHSA-SI-NEXT:    s_endpgm
6915;
6916; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
6917; GCN-HSA:       ; %bb.0:
6918; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
6919; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
6920; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
6921; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
6922; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
6923; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
6924; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
6925; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
6926; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
6927; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
6928; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
6929; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6930; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s3
6931; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s2
6932; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
6933; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6934; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
6935; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
6936; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
6937; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6938; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
6939; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
6940; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
6941; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6942; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
6943; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
6944; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
6945; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6946; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
6947; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
6948; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
6949; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s1
6950; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
6951; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s0
6952; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
6953; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
6954; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
6955; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
6956; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
6957; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
6958; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
6959; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[4:5], 48
6960; GCN-HSA-NEXT:    v_bfe_i32 v8, v5, 0, 16
6961; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6962; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v7
6963; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
6964; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
6965; GCN-HSA-NEXT:    v_bfe_i32 v8, v5, 0, 16
6966; GCN-HSA-NEXT:    v_ashr_i64 v[10:11], v[6:7], 48
6967; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6968; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
6969; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
6970; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
6971; GCN-HSA-NEXT:    v_bfe_i32 v9, v16, 0, 16
6972; GCN-HSA-NEXT:    v_bfe_i32 v4, v4, 0, 16
6973; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
6974; GCN-HSA-NEXT:    v_bfe_i32 v6, v17, 0, 16
6975; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
6976; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
6977; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[7:10]
6978; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
6979; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v3
6980; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
6981; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
6982; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
6983; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
6984; GCN-HSA-NEXT:    v_ashr_i64 v[14:15], v[0:1], 48
6985; GCN-HSA-NEXT:    v_bfe_i32 v12, v1, 0, 16
6986; GCN-HSA-NEXT:    v_bfe_i32 v8, v0, 0, 16
6987; GCN-HSA-NEXT:    v_bfe_i32 v4, v2, 0, 16
6988; GCN-HSA-NEXT:    v_ashr_i64 v[2:3], v[2:3], 48
6989; GCN-HSA-NEXT:    v_bfe_i32 v10, v17, 0, 16
6990; GCN-HSA-NEXT:    v_bfe_i32 v6, v16, 0, 16
6991; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
6992; GCN-HSA-NEXT:    v_bfe_i32 v0, v11, 0, 16
6993; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
6994; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
6995; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
6996; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
6997; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
6998; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
6999; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
7000; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
7001; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
7002; GCN-HSA-NEXT:    s_endpgm
7003;
7004; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64:
7005; GCN-NOHSA-VI:       ; %bb.0:
7006; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
7007; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
7008; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
7009; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
7010; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
7011; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
7012; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
7013; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
7014; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
7015; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
7016; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
7017; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
7018; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
7019; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v0, 0, 16
7020; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
7021; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v1, 0, 16
7022; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
7023; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v12, v2, 0, 16
7024; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
7025; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v1, 0, 16
7026; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
7027; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
7028; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v1, 0, 16
7029; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
7030; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v3
7031; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
7032; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v6
7033; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v22, v7
7034; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
7035; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v5, 0, 16
7036; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v27, v6, 0, 16
7037; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v4, v4, 0, 16
7038; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v1, 0, 16
7039; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v10, v9, 0, 16
7040; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v14, v11, 0, 16
7041; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v13, 0, 16
7042; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v21, v3, 0, 16
7043; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
7044; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
7045; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v22, 0, 16
7046; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v7, 0, 16
7047; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v29, v20, 0, 16
7048; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
7049; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
7050; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
7051; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
7052; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
7053; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
7054; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
7055; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
7056; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
7057; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
7058; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
7059; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
7060; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v30, 31, v29
7061; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
7062; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
7063; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
7064; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112
7065; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:96
7066; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
7067; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
7068; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
7069; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0
7070; GCN-NOHSA-VI-NEXT:    s_endpgm
7071;
7072; EG-LABEL: global_sextload_v16i16_to_v16i64:
7073; EG:       ; %bb.0:
7074; EG-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
7075; EG-NEXT:    TEX 1 @12
7076; EG-NEXT:    ALU 65, @17, KC0[CB0:0-32], KC1[]
7077; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T12.X, 0
7078; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 0
7079; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
7080; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T17.X, 0
7081; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T16.X, 0
7082; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T15.X, 0
7083; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T14.X, 0
7084; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T13.X, 1
7085; EG-NEXT:    CF_END
7086; EG-NEXT:    Fetch clause starting at 12:
7087; EG-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 16, #1
7088; EG-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 0, #1
7089; EG-NEXT:    ALU clause starting at 16:
7090; EG-NEXT:     MOV * T11.X, KC0[2].Z,
7091; EG-NEXT:    ALU clause starting at 17:
7092; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
7093; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7094; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
7095; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
7096; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7097; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
7098; EG-NEXT:     LSHR T15.X, PV.W, literal.x,
7099; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7100; EG-NEXT:    2(2.802597e-45), 48(6.726233e-44)
7101; EG-NEXT:     LSHR T16.X, PV.W, literal.x,
7102; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7103; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
7104; EG-NEXT:     LSHR T17.X, PV.W, literal.x,
7105; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7106; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
7107; EG-NEXT:     LSHR T18.X, PV.W, literal.x,
7108; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
7109; EG-NEXT:     ASHR * T19.W, T11.X, literal.z,
7110; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
7111; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
7112; EG-NEXT:     LSHR T20.X, PV.W, literal.x,
7113; EG-NEXT:     ASHR T19.Z, T11.X, literal.y,
7114; EG-NEXT:     ASHR * T21.W, T11.Y, literal.z,
7115; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
7116; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
7117; EG-NEXT:     BFE_INT T19.X, T11.X, 0.0, literal.x,
7118; EG-NEXT:     ASHR T21.Z, T11.Y, literal.x,
7119; EG-NEXT:     ASHR * T22.W, T11.Z, literal.y,
7120; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7121; EG-NEXT:     BFE_INT T21.X, T11.Y, 0.0, literal.x,
7122; EG-NEXT:     ASHR T19.Y, PV.X, literal.y,
7123; EG-NEXT:     ASHR T22.Z, T11.Z, literal.x,
7124; EG-NEXT:     ASHR * T23.W, T11.W, literal.y,
7125; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7126; EG-NEXT:     BFE_INT T22.X, T11.Z, 0.0, literal.x,
7127; EG-NEXT:     ASHR T21.Y, PV.X, literal.y,
7128; EG-NEXT:     ASHR T23.Z, T11.W, literal.x,
7129; EG-NEXT:     ASHR * T24.W, T12.X, literal.y,
7130; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7131; EG-NEXT:     BFE_INT T23.X, T11.W, 0.0, literal.x,
7132; EG-NEXT:     ASHR T22.Y, PV.X, literal.y,
7133; EG-NEXT:     ASHR T24.Z, T12.X, literal.x,
7134; EG-NEXT:     ASHR * T11.W, T12.Y, literal.y,
7135; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7136; EG-NEXT:     BFE_INT T24.X, T12.X, 0.0, literal.x,
7137; EG-NEXT:     ASHR T23.Y, PV.X, literal.y,
7138; EG-NEXT:     ASHR T11.Z, T12.Y, literal.x,
7139; EG-NEXT:     ASHR * T25.W, T12.Z, literal.y,
7140; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7141; EG-NEXT:     BFE_INT T11.X, T12.Y, 0.0, literal.x,
7142; EG-NEXT:     ASHR T24.Y, PV.X, literal.y,
7143; EG-NEXT:     ASHR T25.Z, T12.Z, literal.x,
7144; EG-NEXT:     ASHR * T26.W, T12.W, literal.y,
7145; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7146; EG-NEXT:     BFE_INT T25.X, T12.Z, 0.0, literal.x,
7147; EG-NEXT:     ASHR T11.Y, PV.X, literal.y,
7148; EG-NEXT:     ASHR * T26.Z, T12.W, literal.x,
7149; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7150; EG-NEXT:     BFE_INT T26.X, T12.W, 0.0, literal.x,
7151; EG-NEXT:     ASHR T25.Y, PV.X, literal.y,
7152; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
7153; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7154; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
7155; EG-NEXT:     LSHR T12.X, PV.W, literal.x,
7156; EG-NEXT:     ASHR * T26.Y, PV.X, literal.y,
7157; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
7158;
7159; CM-LABEL: global_sextload_v16i16_to_v16i64:
7160; CM:       ; %bb.0:
7161; CM-NEXT:    ALU 0, @16, KC0[CB0:0-32], KC1[]
7162; CM-NEXT:    TEX 1 @12
7163; CM-NEXT:    ALU 65, @17, KC0[CB0:0-32], KC1[]
7164; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T26.X
7165; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T20.X
7166; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T18.X
7167; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T23, T17.X
7168; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T16.X
7169; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T15.X
7170; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T14.X
7171; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T13.X
7172; CM-NEXT:    CF_END
7173; CM-NEXT:    Fetch clause starting at 12:
7174; CM-NEXT:     VTX_READ_128 T12.XYZW, T11.X, 0, #1
7175; CM-NEXT:     VTX_READ_128 T11.XYZW, T11.X, 16, #1
7176; CM-NEXT:    ALU clause starting at 16:
7177; CM-NEXT:     MOV * T11.X, KC0[2].Z,
7178; CM-NEXT:    ALU clause starting at 17:
7179; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
7180; CM-NEXT:    112(1.569454e-43), 0(0.000000e+00)
7181; CM-NEXT:     LSHR T13.X, PV.W, literal.x,
7182; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7183; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
7184; CM-NEXT:     LSHR T14.X, PV.W, literal.x,
7185; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7186; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
7187; CM-NEXT:     LSHR T15.X, PV.W, literal.x,
7188; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7189; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
7190; CM-NEXT:     LSHR T16.X, PV.W, literal.x,
7191; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7192; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
7193; CM-NEXT:     LSHR T17.X, PV.W, literal.x,
7194; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7195; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
7196; CM-NEXT:     LSHR T18.X, PV.W, literal.x,
7197; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.y,
7198; CM-NEXT:     ASHR * T19.W, T11.W, literal.z,
7199; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
7200; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
7201; CM-NEXT:     LSHR T20.X, PV.Z, literal.x,
7202; CM-NEXT:     ASHR T19.Z, T11.W, literal.y,
7203; CM-NEXT:     ASHR * T21.W, T11.Z, literal.z,
7204; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
7205; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
7206; CM-NEXT:     BFE_INT T19.X, T11.W, 0.0, literal.x,
7207; CM-NEXT:     ASHR T21.Z, T11.Z, literal.x,
7208; CM-NEXT:     ASHR * T22.W, T11.Y, literal.y,
7209; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7210; CM-NEXT:     BFE_INT T21.X, T11.Z, 0.0, literal.x,
7211; CM-NEXT:     ASHR T19.Y, PV.X, literal.y,
7212; CM-NEXT:     ASHR T22.Z, T11.Y, literal.x,
7213; CM-NEXT:     ASHR * T11.W, T11.X, literal.y,
7214; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7215; CM-NEXT:     BFE_INT T22.X, T11.Y, 0.0, literal.x,
7216; CM-NEXT:     ASHR T21.Y, PV.X, literal.y,
7217; CM-NEXT:     ASHR T11.Z, T11.X, literal.x,
7218; CM-NEXT:     ASHR * T23.W, T12.W, literal.y,
7219; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7220; CM-NEXT:     BFE_INT T11.X, T11.X, 0.0, literal.x,
7221; CM-NEXT:     ASHR T22.Y, PV.X, literal.y,
7222; CM-NEXT:     ASHR T23.Z, T12.W, literal.x,
7223; CM-NEXT:     ASHR * T24.W, T12.Z, literal.y,
7224; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7225; CM-NEXT:     BFE_INT T23.X, T12.W, 0.0, literal.x,
7226; CM-NEXT:     ASHR T11.Y, PV.X, literal.y,
7227; CM-NEXT:     ASHR T24.Z, T12.Z, literal.x,
7228; CM-NEXT:     ASHR * T25.W, T12.Y, literal.y,
7229; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7230; CM-NEXT:     BFE_INT T24.X, T12.Z, 0.0, literal.x,
7231; CM-NEXT:     ASHR T23.Y, PV.X, literal.y,
7232; CM-NEXT:     ASHR T25.Z, T12.Y, literal.x,
7233; CM-NEXT:     ASHR * T12.W, T12.X, literal.y,
7234; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7235; CM-NEXT:     BFE_INT T25.X, T12.Y, 0.0, literal.x,
7236; CM-NEXT:     ASHR T24.Y, PV.X, literal.y,
7237; CM-NEXT:     ASHR * T12.Z, T12.X, literal.x,
7238; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7239; CM-NEXT:     BFE_INT T12.X, T12.X, 0.0, literal.x,
7240; CM-NEXT:     ASHR * T25.Y, PV.X, literal.y,
7241; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
7242; CM-NEXT:     LSHR T26.X, KC0[2].Y, literal.x,
7243; CM-NEXT:     ASHR * T12.Y, PV.X, literal.y,
7244; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
7245  %load = load <16 x i16>, ptr addrspace(1) %in
7246  %ext = sext <16 x i16> %load to <16 x i64>
7247  store <16 x i64> %ext, ptr addrspace(1) %out
7248  ret void
7249}
7250
7251define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
7252; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i64:
7253; GCN-NOHSA-SI:       ; %bb.0:
7254; GCN-NOHSA-SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
7255; GCN-NOHSA-SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
7256; GCN-NOHSA-SI-NEXT:    s_mov_b32 s14, -1
7257; GCN-NOHSA-SI-NEXT:    s_mov_b32 s15, 0xe8f000
7258; GCN-NOHSA-SI-NEXT:    s_add_u32 s12, s12, s11
7259; GCN-NOHSA-SI-NEXT:    s_addc_u32 s13, s13, 0
7260; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
7261; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
7262; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
7263; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v39, 0
7264; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
7265; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
7266; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
7267; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
7268; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
7269; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[14:17], off, s[8:11], 0
7270; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:16
7271; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:32
7272; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:48
7273; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
7274; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
7275; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
7276; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
7277; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
7278; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
7279; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
7280; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
7281; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
7282; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
7283; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
7284; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
7285; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v16
7286; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v3
7287; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
7288; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v17
7289; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
7290; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
7291; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v20
7292; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v5
7293; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v19
7294; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v19
7295; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v21
7296; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, 0xffff, v21
7297; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(5)
7298; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v22
7299; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, 0xffff, v22
7300; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v24
7301; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
7302; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
7303; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v50, 0xffff, v23
7304; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v25
7305; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v54, 0xffff, v25
7306; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(4)
7307; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v29
7308; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v26
7309; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, 0xffff, v26
7310; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v28
7311; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v28
7312; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
7313; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v27
7314; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v27
7315; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, 0xffff, v29
7316; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v41, v39
7317; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v39
7318; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
7319; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v39
7320; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v55, v39
7321; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v57, v39
7322; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v51, v39
7323; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v53, v39
7324; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v43, v39
7325; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v45, v39
7326; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v39
7327; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v39
7328; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v39
7329; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v37, v39
7330; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v39
7331; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, v39
7332; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v39
7333; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v39
7334; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v59, v39
7335; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v61, v39
7336; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v39
7337; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v39
7338; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v47, v39
7339; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v49, v39
7340; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v39
7341; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v39
7342; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v39
7343; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v39
7344; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
7345; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
7346; GCN-NOHSA-SI-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
7347; GCN-NOHSA-SI-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
7348; GCN-NOHSA-SI-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
7349; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
7350; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
7351; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
7352; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
7353; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
7354; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
7355; GCN-NOHSA-SI-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
7356; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
7357; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
7358; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
7359; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
7360; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
7361; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
7362; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
7363; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
7364; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:240
7365; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
7366; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176
7367; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144
7368; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112
7369; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
7370; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48
7371; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:16
7372; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:224
7373; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
7374; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
7375; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
7376; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
7377; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
7378; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
7379; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
7380; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
7381; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
7382; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
7383; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
7384; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
7385; GCN-NOHSA-SI-NEXT:    s_endpgm
7386;
7387; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
7388; GCN-HSA:       ; %bb.0:
7389; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
7390; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
7391; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
7392; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
7393; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
7394; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
7395; GCN-HSA-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
7396; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
7397; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
7398; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
7399; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
7400; GCN-HSA-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
7401; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
7402; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
7403; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
7404; GCN-HSA-NEXT:    flat_load_dwordx4 v[10:13], v[0:1]
7405; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
7406; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
7407; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
7408; GCN-HSA-NEXT:    flat_load_dwordx4 v[14:17], v[0:1]
7409; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
7410; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
7411; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
7412; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
7413; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0xf0
7414; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
7415; GCN-HSA-NEXT:    s_add_u32 s8, s0, 0xd0
7416; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
7417; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0xb0
7418; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
7419; GCN-HSA-NEXT:    s_add_u32 s12, s0, 0x90
7420; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
7421; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x70
7422; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
7423; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
7424; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
7425; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
7426; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x50
7427; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v1
7428; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v1
7429; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
7430; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v1
7431; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
7432; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
7433; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
7434; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
7435; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
7436; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
7437; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
7438; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v3
7439; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
7440; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s11
7441; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s10
7442; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
7443; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
7444; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v9
7445; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
7446; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s13
7447; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s12
7448; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
7449; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
7450; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
7451; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
7452; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s4
7453; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
7454; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
7455; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
7456; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
7457; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
7458; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v1
7459; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v1
7460; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
7461; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
7462; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
7463; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v17
7464; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
7465; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s9
7466; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
7467; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s8
7468; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
7469; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v15
7470; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
7471; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
7472; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
7473; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
7474; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
7475; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v12
7476; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s1
7477; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
7478; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
7479; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s0
7480; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
7481; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v10
7482; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
7483; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[17:20]
7484; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v16
7485; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v16
7486; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s5
7487; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v1
7488; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v1
7489; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s4
7490; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
7491; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v13
7492; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
7493; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v2
7494; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
7495; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
7496; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
7497; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
7498; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
7499; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
7500; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
7501; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
7502; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v23, 16, v14
7503; GCN-HSA-NEXT:    v_and_b32_e32 v21, 0xffff, v14
7504; GCN-HSA-NEXT:    v_mov_b32_e32 v22, v1
7505; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
7506; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
7507; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
7508; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
7509; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
7510; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v8
7511; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v8
7512; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v1
7513; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v1
7514; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
7515; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
7516; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[18:21]
7517; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
7518; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
7519; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
7520; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
7521; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
7522; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
7523; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v6
7524; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v1
7525; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v1
7526; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
7527; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
7528; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
7529; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v1
7530; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
7531; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v1
7532; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
7533; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
7534; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
7535; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v4
7536; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
7537; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
7538; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[5:8]
7539; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
7540; GCN-HSA-NEXT:    s_endpgm
7541;
7542; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:
7543; GCN-NOHSA-VI:       ; %bb.0:
7544; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
7545; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
7546; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
7547; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
7548; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
7549; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
7550; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
7551; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
7552; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
7553; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:16
7554; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
7555; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
7556; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
7557; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v10
7558; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
7559; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
7560; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v17, 0xffff, v14
7561; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v13
7562; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v21, 0xffff, v13
7563; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
7564; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v25, 0xffff, v16
7565; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v15
7566; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v15
7567; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:32
7568; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:48
7569; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
7570; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v8
7571; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
7572; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, 0xffff, v7
7573; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
7574; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
7575; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v9
7576; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
7577; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v40, 16, v13
7578; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, 0xffff, v13
7579; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v15
7580; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, 0xffff, v15
7581; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
7582; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v32
7583; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v13, 0xffff, v32
7584; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v34
7585; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, 0xffff, v34
7586; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
7587; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, 0xffff, v33
7588; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v33, 0
7589; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v51, v33
7590; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v53, v33
7591; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v31
7592; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v47, 0xffff, v31
7593; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:240
7594; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, v33
7595; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v50, v33
7596; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192
7597; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, v33
7598; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v47, v33
7599; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v16
7600; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v41, 0xffff, v16
7601; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
7602; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, v33
7603; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v44, v33
7604; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:176
7605; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v39, v33
7606; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v41, v33
7607; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v33
7608; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, v33
7609; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v14
7610; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v35, 0xffff, v14
7611; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v33
7612; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, v33
7613; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128
7614; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v36, v33
7615; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v38, v33
7616; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
7617; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v26, v33
7618; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, v33
7619; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208
7620; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:144
7621; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v22, v33
7622; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, v33
7623; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v24, v33
7624; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v33
7625; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v20, v33
7626; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v33
7627; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v33
7628; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v33
7629; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v33
7630; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v33
7631; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112
7632; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224
7633; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64
7634; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80
7635; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
7636; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
7637; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v33
7638; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, v33
7639; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0
7640; GCN-NOHSA-VI-NEXT:    s_nop 0
7641; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v33
7642; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
7643; GCN-NOHSA-VI-NEXT:    s_endpgm
7644;
7645; EG-LABEL: global_zextload_v32i16_to_v32i64:
7646; EG:       ; %bb.0:
7647; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
7648; EG-NEXT:    TEX 2 @22
7649; EG-NEXT:    ALU 33, @31, KC0[], KC1[]
7650; EG-NEXT:    TEX 0 @28
7651; EG-NEXT:    ALU 93, @65, KC0[CB0:0-32], KC1[]
7652; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T50.X, 0
7653; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T49.X, 0
7654; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T48.X, 0
7655; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T47.X, 0
7656; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T46.X, 0
7657; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T45.X, 0
7658; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T44.X, 0
7659; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T43.X, 0
7660; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T42.X, 0
7661; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T41.X, 0
7662; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T40.X, 0
7663; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T39.X, 0
7664; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T38.X, 0
7665; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T37.X, 0
7666; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T36.X, 0
7667; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T35.X, 1
7668; EG-NEXT:    CF_END
7669; EG-NEXT:    Fetch clause starting at 22:
7670; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 48, #1
7671; EG-NEXT:     VTX_READ_128 T21.XYZW, T19.X, 16, #1
7672; EG-NEXT:     VTX_READ_128 T22.XYZW, T19.X, 32, #1
7673; EG-NEXT:    Fetch clause starting at 28:
7674; EG-NEXT:     VTX_READ_128 T29.XYZW, T19.X, 0, #1
7675; EG-NEXT:    ALU clause starting at 30:
7676; EG-NEXT:     MOV * T19.X, KC0[2].Z,
7677; EG-NEXT:    ALU clause starting at 31:
7678; EG-NEXT:     LSHR * T23.Z, T20.Z, literal.x,
7679; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7680; EG-NEXT:     AND_INT T23.X, T20.Z, literal.x,
7681; EG-NEXT:     MOV T23.Y, 0.0,
7682; EG-NEXT:     LSHR T24.Z, T20.W, literal.y,
7683; EG-NEXT:     AND_INT * T24.X, T20.W, literal.x,
7684; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7685; EG-NEXT:     MOV T24.Y, 0.0,
7686; EG-NEXT:     LSHR * T25.Z, T20.X, literal.x,
7687; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7688; EG-NEXT:     AND_INT T25.X, T20.X, literal.x,
7689; EG-NEXT:     MOV T25.Y, 0.0,
7690; EG-NEXT:     LSHR T20.Z, T20.Y, literal.y,
7691; EG-NEXT:     AND_INT * T20.X, T20.Y, literal.x,
7692; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7693; EG-NEXT:     MOV T20.Y, 0.0,
7694; EG-NEXT:     LSHR * T26.Z, T22.Z, literal.x,
7695; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7696; EG-NEXT:     AND_INT T26.X, T22.Z, literal.x,
7697; EG-NEXT:     MOV T26.Y, 0.0,
7698; EG-NEXT:     LSHR T27.Z, T22.W, literal.y,
7699; EG-NEXT:     AND_INT * T27.X, T22.W, literal.x,
7700; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7701; EG-NEXT:     MOV T27.Y, 0.0,
7702; EG-NEXT:     LSHR * T28.Z, T22.X, literal.x,
7703; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7704; EG-NEXT:     AND_INT T28.X, T22.X, literal.x,
7705; EG-NEXT:     MOV T28.Y, 0.0,
7706; EG-NEXT:     LSHR T22.Z, T22.Y, literal.y,
7707; EG-NEXT:     AND_INT * T22.X, T22.Y, literal.x,
7708; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7709; EG-NEXT:     MOV T22.Y, 0.0,
7710; EG-NEXT:     LSHR * T19.Z, T21.Z, literal.x,
7711; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7712; EG-NEXT:    ALU clause starting at 65:
7713; EG-NEXT:     AND_INT T19.X, T21.Z, literal.x,
7714; EG-NEXT:     MOV T19.Y, 0.0,
7715; EG-NEXT:     LSHR T30.Z, T21.W, literal.y,
7716; EG-NEXT:     AND_INT * T30.X, T21.W, literal.x,
7717; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7718; EG-NEXT:     MOV T30.Y, 0.0,
7719; EG-NEXT:     LSHR * T31.Z, T21.X, literal.x,
7720; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7721; EG-NEXT:     AND_INT T31.X, T21.X, literal.x,
7722; EG-NEXT:     MOV T31.Y, 0.0,
7723; EG-NEXT:     LSHR T21.Z, T21.Y, literal.y,
7724; EG-NEXT:     AND_INT * T21.X, T21.Y, literal.x,
7725; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7726; EG-NEXT:     MOV T21.Y, 0.0,
7727; EG-NEXT:     LSHR * T32.Z, T29.Z, literal.x,
7728; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7729; EG-NEXT:     AND_INT T32.X, T29.Z, literal.x,
7730; EG-NEXT:     MOV T32.Y, 0.0,
7731; EG-NEXT:     LSHR T33.Z, T29.W, literal.y,
7732; EG-NEXT:     AND_INT * T33.X, T29.W, literal.x,
7733; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7734; EG-NEXT:     MOV T33.Y, 0.0,
7735; EG-NEXT:     LSHR * T34.Z, T29.X, literal.x,
7736; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7737; EG-NEXT:     AND_INT T34.X, T29.X, literal.x,
7738; EG-NEXT:     MOV T34.Y, 0.0,
7739; EG-NEXT:     LSHR T29.Z, T29.Y, literal.y,
7740; EG-NEXT:     AND_INT * T29.X, T29.Y, literal.x,
7741; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7742; EG-NEXT:     MOV T29.Y, 0.0,
7743; EG-NEXT:     MOV T23.W, 0.0,
7744; EG-NEXT:     MOV * T24.W, 0.0,
7745; EG-NEXT:     MOV T25.W, 0.0,
7746; EG-NEXT:     MOV * T20.W, 0.0,
7747; EG-NEXT:     MOV T26.W, 0.0,
7748; EG-NEXT:     MOV * T27.W, 0.0,
7749; EG-NEXT:     MOV T28.W, 0.0,
7750; EG-NEXT:     MOV * T22.W, 0.0,
7751; EG-NEXT:     MOV T19.W, 0.0,
7752; EG-NEXT:     MOV * T30.W, 0.0,
7753; EG-NEXT:     MOV T31.W, 0.0,
7754; EG-NEXT:     MOV * T21.W, 0.0,
7755; EG-NEXT:     MOV T32.W, 0.0,
7756; EG-NEXT:     MOV * T33.W, 0.0,
7757; EG-NEXT:     MOV T34.W, 0.0,
7758; EG-NEXT:     MOV * T29.W, 0.0,
7759; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
7760; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7761; EG-NEXT:     LSHR T35.X, PV.W, literal.x,
7762; EG-NEXT:     LSHR * T36.X, KC0[2].Y, literal.x,
7763; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
7764; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
7765; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
7766; EG-NEXT:     LSHR T37.X, PV.W, literal.x,
7767; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7768; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
7769; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
7770; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7771; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
7772; EG-NEXT:     LSHR T39.X, PV.W, literal.x,
7773; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7774; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
7775; EG-NEXT:     LSHR T40.X, PV.W, literal.x,
7776; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7777; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
7778; EG-NEXT:     LSHR T41.X, PV.W, literal.x,
7779; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7780; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
7781; EG-NEXT:     LSHR T42.X, PV.W, literal.x,
7782; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7783; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
7784; EG-NEXT:     LSHR T43.X, PV.W, literal.x,
7785; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7786; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
7787; EG-NEXT:     LSHR T44.X, PV.W, literal.x,
7788; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7789; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
7790; EG-NEXT:     LSHR T45.X, PV.W, literal.x,
7791; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7792; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
7793; EG-NEXT:     LSHR T46.X, PV.W, literal.x,
7794; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7795; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
7796; EG-NEXT:     LSHR T47.X, PV.W, literal.x,
7797; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7798; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
7799; EG-NEXT:     LSHR T48.X, PV.W, literal.x,
7800; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7801; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
7802; EG-NEXT:     LSHR T49.X, PV.W, literal.x,
7803; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7804; EG-NEXT:    2(2.802597e-45), 224(3.138909e-43)
7805; EG-NEXT:     LSHR * T50.X, PV.W, literal.x,
7806; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
7807;
7808; CM-LABEL: global_zextload_v32i16_to_v32i64:
7809; CM:       ; %bb.0:
7810; CM-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
7811; CM-NEXT:    TEX 2 @22
7812; CM-NEXT:    ALU 33, @31, KC0[], KC1[]
7813; CM-NEXT:    TEX 0 @28
7814; CM-NEXT:    ALU 94, @65, KC0[CB0:0-32], KC1[]
7815; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T50.X
7816; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T24, T49.X
7817; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T25, T48.X
7818; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T26, T47.X
7819; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T21, T46.X
7820; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T27, T45.X
7821; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T28, T44.X
7822; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T29, T43.X
7823; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T42.X
7824; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T30, T41.X
7825; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T31, T40.X
7826; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T32, T39.X
7827; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T22, T38.X
7828; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T33, T37.X
7829; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T34, T36.X
7830; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T23.X
7831; CM-NEXT:    CF_END
7832; CM-NEXT:    Fetch clause starting at 22:
7833; CM-NEXT:     VTX_READ_128 T21.XYZW, T20.X, 0, #1
7834; CM-NEXT:     VTX_READ_128 T22.XYZW, T20.X, 32, #1
7835; CM-NEXT:     VTX_READ_128 T23.XYZW, T20.X, 16, #1
7836; CM-NEXT:    Fetch clause starting at 28:
7837; CM-NEXT:     VTX_READ_128 T23.XYZW, T20.X, 48, #1
7838; CM-NEXT:    ALU clause starting at 30:
7839; CM-NEXT:     MOV * T20.X, KC0[2].Z,
7840; CM-NEXT:    ALU clause starting at 31:
7841; CM-NEXT:     LSHR * T19.Z, T21.Y, literal.x,
7842; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
7843; CM-NEXT:     AND_INT T19.X, T21.Y, literal.x,
7844; CM-NEXT:     MOV T19.Y, 0.0,
7845; CM-NEXT:     LSHR * T24.Z, T21.X, literal.y,
7846; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7847; CM-NEXT:     AND_INT T24.X, T21.X, literal.x,
7848; CM-NEXT:     MOV T24.Y, 0.0,
7849; CM-NEXT:     LSHR * T25.Z, T21.W, literal.y,
7850; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7851; CM-NEXT:     AND_INT T25.X, T21.W, literal.x,
7852; CM-NEXT:     MOV T25.Y, 0.0,
7853; CM-NEXT:     LSHR * T26.Z, T21.Z, literal.y,
7854; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7855; CM-NEXT:     AND_INT T26.X, T21.Z, literal.x,
7856; CM-NEXT:     MOV T26.Y, 0.0,
7857; CM-NEXT:     LSHR * T21.Z, T23.Y, literal.y,
7858; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7859; CM-NEXT:     AND_INT T21.X, T23.Y, literal.x,
7860; CM-NEXT:     MOV T21.Y, 0.0,
7861; CM-NEXT:     LSHR * T27.Z, T23.X, literal.y,
7862; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7863; CM-NEXT:     AND_INT T27.X, T23.X, literal.x,
7864; CM-NEXT:     MOV T27.Y, 0.0,
7865; CM-NEXT:     LSHR * T28.Z, T23.W, literal.y,
7866; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7867; CM-NEXT:     AND_INT T28.X, T23.W, literal.x,
7868; CM-NEXT:     MOV T28.Y, 0.0,
7869; CM-NEXT:     LSHR * T29.Z, T23.Z, literal.y,
7870; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7871; CM-NEXT:     AND_INT T29.X, T23.Z, literal.x,
7872; CM-NEXT:     MOV T29.Y, 0.0,
7873; CM-NEXT:     LSHR * T20.Z, T22.Y, literal.y,
7874; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7875; CM-NEXT:    ALU clause starting at 65:
7876; CM-NEXT:     AND_INT T20.X, T22.Y, literal.x,
7877; CM-NEXT:     MOV T20.Y, 0.0,
7878; CM-NEXT:     LSHR * T30.Z, T22.X, literal.y,
7879; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7880; CM-NEXT:     AND_INT T30.X, T22.X, literal.x,
7881; CM-NEXT:     MOV T30.Y, 0.0,
7882; CM-NEXT:     LSHR * T31.Z, T22.W, literal.y,
7883; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7884; CM-NEXT:     AND_INT T31.X, T22.W, literal.x,
7885; CM-NEXT:     MOV T31.Y, 0.0,
7886; CM-NEXT:     LSHR * T32.Z, T22.Z, literal.y,
7887; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7888; CM-NEXT:     AND_INT T32.X, T22.Z, literal.x,
7889; CM-NEXT:     MOV T32.Y, 0.0,
7890; CM-NEXT:     LSHR * T22.Z, T23.Y, literal.y,
7891; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7892; CM-NEXT:     AND_INT T22.X, T23.Y, literal.x,
7893; CM-NEXT:     MOV T22.Y, 0.0,
7894; CM-NEXT:     LSHR * T33.Z, T23.X, literal.y,
7895; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7896; CM-NEXT:     AND_INT T33.X, T23.X, literal.x,
7897; CM-NEXT:     MOV T33.Y, 0.0,
7898; CM-NEXT:     LSHR * T34.Z, T23.W, literal.y,
7899; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7900; CM-NEXT:     AND_INT T34.X, T23.W, literal.x,
7901; CM-NEXT:     MOV T34.Y, 0.0,
7902; CM-NEXT:     LSHR * T35.Z, T23.Z, literal.y,
7903; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
7904; CM-NEXT:     AND_INT T35.X, T23.Z, literal.x,
7905; CM-NEXT:     MOV T35.Y, 0.0,
7906; CM-NEXT:     MOV * T19.W, 0.0,
7907; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
7908; CM-NEXT:     MOV * T24.W, 0.0,
7909; CM-NEXT:     MOV * T25.W, 0.0,
7910; CM-NEXT:     MOV * T26.W, 0.0,
7911; CM-NEXT:     MOV * T21.W, 0.0,
7912; CM-NEXT:     MOV * T27.W, 0.0,
7913; CM-NEXT:     MOV * T28.W, 0.0,
7914; CM-NEXT:     MOV * T29.W, 0.0,
7915; CM-NEXT:     MOV * T20.W, 0.0,
7916; CM-NEXT:     MOV * T30.W, 0.0,
7917; CM-NEXT:     MOV * T31.W, 0.0,
7918; CM-NEXT:     MOV * T32.W, 0.0,
7919; CM-NEXT:     MOV * T22.W, 0.0,
7920; CM-NEXT:     MOV * T33.W, 0.0,
7921; CM-NEXT:     MOV * T34.W, 0.0,
7922; CM-NEXT:     MOV * T35.W, 0.0,
7923; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
7924; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
7925; CM-NEXT:     LSHR T23.X, PV.W, literal.x,
7926; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7927; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
7928; CM-NEXT:     LSHR T36.X, PV.W, literal.x,
7929; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7930; CM-NEXT:    2(2.802597e-45), 192(2.690493e-43)
7931; CM-NEXT:     LSHR T37.X, PV.W, literal.x,
7932; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7933; CM-NEXT:    2(2.802597e-45), 208(2.914701e-43)
7934; CM-NEXT:     LSHR T38.X, PV.W, literal.x,
7935; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7936; CM-NEXT:    2(2.802597e-45), 160(2.242078e-43)
7937; CM-NEXT:     LSHR T39.X, PV.W, literal.x,
7938; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7939; CM-NEXT:    2(2.802597e-45), 176(2.466285e-43)
7940; CM-NEXT:     LSHR T40.X, PV.W, literal.x,
7941; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7942; CM-NEXT:    2(2.802597e-45), 128(1.793662e-43)
7943; CM-NEXT:     LSHR T41.X, PV.W, literal.x,
7944; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7945; CM-NEXT:    2(2.802597e-45), 144(2.017870e-43)
7946; CM-NEXT:     LSHR T42.X, PV.W, literal.x,
7947; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7948; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
7949; CM-NEXT:     LSHR T43.X, PV.W, literal.x,
7950; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7951; CM-NEXT:    2(2.802597e-45), 112(1.569454e-43)
7952; CM-NEXT:     LSHR T44.X, PV.W, literal.x,
7953; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7954; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
7955; CM-NEXT:     LSHR T45.X, PV.W, literal.x,
7956; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7957; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
7958; CM-NEXT:     LSHR T46.X, PV.W, literal.x,
7959; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7960; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
7961; CM-NEXT:     LSHR T47.X, PV.W, literal.x,
7962; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7963; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
7964; CM-NEXT:     LSHR * T48.X, PV.W, literal.x,
7965; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
7966; CM-NEXT:     LSHR T49.X, KC0[2].Y, literal.x,
7967; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
7968; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
7969; CM-NEXT:     LSHR * T50.X, PV.W, literal.x,
7970; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
7971  %load = load <32 x i16>, ptr addrspace(1) %in
7972  %ext = zext <32 x i16> %load to <32 x i64>
7973  store <32 x i64> %ext, ptr addrspace(1) %out
7974  ret void
7975}
7976
7977define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
7978; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64:
7979; GCN-NOHSA-SI:       ; %bb.0:
7980; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
7981; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
7982; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
7983; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
7984; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
7985; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
7986; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
7987; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
7988; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0
7989; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16
7990; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
7991; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
7992; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
7993; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
7994; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
7995; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, v3
7996; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, v7
7997; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v11
7998; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v15
7999; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
8000; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
8001; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
8002; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v8
8003; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
8004; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v22, 0, 16
8005; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[24:25], v[2:3], 48
8006; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8007; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240
8008; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8009; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[24:25], v[0:1], 48
8010; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v1, 0, 16
8011; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8012; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208
8013; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
8014; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8015; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v26, 0, 16
8016; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[24:25], v[6:7], 48
8017; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8018; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176
8019; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8020; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[24:25], v[4:5], 48
8021; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v5, 0, 16
8022; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8023; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144
8024; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8025; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[24:25], v[10:11], 48
8026; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v27, 0, 16
8027; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8028; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112
8029; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8030; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[24:25], v[8:9], 48
8031; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v22, v9, 0, 16
8032; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8033; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
8034; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8035; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[23:24], v[14:15], 48
8036; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v21, 0, 16
8037; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
8038; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48
8039; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8040; GCN-NOHSA-SI-NEXT:    v_ashr_i64 v[23:24], v[12:13], 48
8041; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v13, 0, 16
8042; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
8043; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16
8044; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v1, v12, 0, 16
8045; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v5, v14, 0, 16
8046; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v3, v3, 0, 16
8047; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v20, 0, 16
8048; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v2, 0, 16
8049; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
8050; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8051; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:224
8052; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v9, v8, 0, 16
8053; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
8054; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v13, v10, 0, 16
8055; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v7, v19, 0, 16
8056; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v11, v18, 0, 16
8057; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v15, v17, 0, 16
8058; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v19, v16, 0, 16
8059; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v17, v4, 0, 16
8060; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
8061; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v21, v6, 0, 16
8062; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v23, v2, 0, 16
8063; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
8064; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v25, v0, 0, 16
8065; GCN-NOHSA-SI-NEXT:    v_bfe_i32 v27, v2, 0, 16
8066; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
8067; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
8068; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
8069; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8070; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
8071; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
8072; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
8073; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
8074; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
8075; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
8076; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
8077; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
8078; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
8079; GCN-NOHSA-SI-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
8080; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192
8081; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160
8082; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128
8083; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96
8084; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
8085; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
8086; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
8087; GCN-NOHSA-SI-NEXT:    s_endpgm
8088;
8089; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
8090; GCN-HSA:       ; %bb.0:
8091; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
8092; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
8093; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
8094; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
8095; GCN-HSA-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
8096; GCN-HSA-NEXT:    s_add_u32 s4, s2, 48
8097; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
8098; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
8099; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
8100; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
8101; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
8102; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
8103; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
8104; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
8105; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
8106; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
8107; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
8108; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
8109; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
8110; GCN-HSA-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
8111; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
8112; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8113; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s3
8114; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s2
8115; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
8116; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8117; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
8118; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
8119; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
8120; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8121; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
8122; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
8123; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
8124; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8125; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
8126; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
8127; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
8128; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8129; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
8130; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[8:9], 48
8131; GCN-HSA-NEXT:    v_bfe_i32 v16, v9, 0, 16
8132; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8133; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
8134; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
8135; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
8136; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
8137; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8138; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
8139; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
8140; GCN-HSA-NEXT:    s_add_u32 s6, s0, 0x50
8141; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
8142; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v11
8143; GCN-HSA-NEXT:    s_add_u32 s8, s0, 32
8144; GCN-HSA-NEXT:    v_bfe_i32 v16, v9, 0, 16
8145; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[10:11], 48
8146; GCN-HSA-NEXT:    s_addc_u32 s9, s1, 0
8147; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8148; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
8149; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
8150; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
8151; GCN-HSA-NEXT:    v_bfe_i32 v18, v9, 0, 16
8152; GCN-HSA-NEXT:    v_bfe_i32 v16, v10, 0, 16
8153; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
8154; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8155; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
8156; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
8157; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s3
8158; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
8159; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[0:1], 48
8160; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
8161; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8162; GCN-HSA-NEXT:    v_mov_b32_e32 v1, v3
8163; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[16:19]
8164; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s2
8165; GCN-HSA-NEXT:    v_bfe_i32 v16, v1, 0, 16
8166; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[2:3], 48
8167; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8168; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
8169; GCN-HSA-NEXT:    s_waitcnt vmcnt(6)
8170; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v7
8171; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[4:5], 48
8172; GCN-HSA-NEXT:    v_bfe_i32 v16, v5, 0, 16
8173; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8174; GCN-HSA-NEXT:    flat_store_dwordx4 v[9:10], v[16:19]
8175; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s7
8176; GCN-HSA-NEXT:    v_bfe_i32 v16, v3, 0, 16
8177; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[6:7], 48
8178; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8179; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
8180; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s6
8181; GCN-HSA-NEXT:    s_waitcnt vmcnt(7)
8182; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[12:13], 48
8183; GCN-HSA-NEXT:    v_bfe_i32 v16, v13, 0, 16
8184; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8185; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v15
8186; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
8187; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
8188; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
8189; GCN-HSA-NEXT:    v_bfe_i32 v16, v3, 0, 16
8190; GCN-HSA-NEXT:    v_ashr_i64 v[18:19], v[14:15], 48
8191; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
8192; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8193; GCN-HSA-NEXT:    v_bfe_i32 v8, v8, 0, 16
8194; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v24, 16, v14
8195; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
8196; GCN-HSA-NEXT:    v_bfe_i32 v10, v1, 0, 16
8197; GCN-HSA-NEXT:    v_bfe_i32 v16, v14, 0, 16
8198; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s1
8199; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
8200; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v12
8201; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
8202; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
8203; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s0
8204; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
8205; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v0
8206; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
8207; GCN-HSA-NEXT:    v_bfe_i32 v14, v18, 0, 16
8208; GCN-HSA-NEXT:    v_bfe_i32 v18, v24, 0, 16
8209; GCN-HSA-NEXT:    v_bfe_i32 v20, v0, 0, 16
8210; GCN-HSA-NEXT:    v_bfe_i32 v24, v2, 0, 16
8211; GCN-HSA-NEXT:    v_bfe_i32 v26, v26, 0, 16
8212; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8213; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
8214; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
8215; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
8216; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
8217; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
8218; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v23, 16, v6
8219; GCN-HSA-NEXT:    v_bfe_i32 v22, v22, 0, 16
8220; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
8221; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8222; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
8223; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
8224; GCN-HSA-NEXT:    v_bfe_i32 v9, v23, 0, 16
8225; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
8226; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
8227; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
8228; GCN-HSA-NEXT:    v_bfe_i32 v7, v6, 0, 16
8229; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
8230; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8231; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
8232; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
8233; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
8234; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
8235; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
8236; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
8237; GCN-HSA-NEXT:    v_bfe_i32 v3, v4, 0, 16
8238; GCN-HSA-NEXT:    v_bfe_i32 v5, v5, 0, 16
8239; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
8240; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8241; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
8242; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
8243; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
8244; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
8245; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
8246; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
8247; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
8248; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
8249; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
8250; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
8251; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
8252; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
8253; GCN-HSA-NEXT:    v_bfe_i32 v12, v12, 0, 16
8254; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
8255; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
8256; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
8257; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
8258; GCN-HSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
8259; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
8260; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
8261; GCN-HSA-NEXT:    s_endpgm
8262;
8263; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64:
8264; GCN-NOHSA-VI:       ; %bb.0:
8265; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
8266; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
8267; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
8268; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, s2
8269; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, s3
8270; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
8271; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
8272; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
8273; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[8:11], 0
8274; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:48
8275; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[9:12], off, s[8:11], 0 offset:32
8276; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[1:4], off, s[8:11], 0 offset:16
8277; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
8278; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
8279; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
8280; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v0, v6, 0, 16
8281; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(2)
8282; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v15, 0, 16
8283; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
8284; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v15, 0, 16
8285; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
8286; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, v16
8287; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
8288; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
8289; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
8290; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224
8291; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
8292; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v6, 0, 16
8293; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v16, 0, 16
8294; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
8295; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
8296; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:240
8297; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
8298; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v18, v13, 0, 16
8299; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v14, 0, 16
8300; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v15, 0, 16
8301; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8302; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
8303; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v20, v6, 0, 16
8304; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208
8305; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
8306; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(4)
8307; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v11
8308; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
8309; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v13, 0, 16
8310; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v11, 0, 16
8311; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:192
8312; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8313; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, v12
8314; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
8315; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
8316; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:160
8317; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v19, 0, 16
8318; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v12, 0, 16
8319; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v8
8320; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
8321; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
8322; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8323; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v9
8324; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176
8325; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v6, v5, 0, 16
8326; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v8, 0, 16
8327; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v9, 0, 16
8328; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
8329; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8330; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v10
8331; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128
8332; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v9, 0, 16
8333; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v11, v7, 0, 16
8334; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
8335; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v7, v10, 0, 16
8336; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
8337; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
8338; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:144
8339; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
8340; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(8)
8341; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v7, v3, 0, 16
8342; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
8343; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v9, v3, 0, 16
8344; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
8345; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
8346; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:96
8347; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
8348; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v8, v5, 0, 16
8349; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v4
8350; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
8351; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
8352; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v27, v5, 0, 16
8353; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v29, v4, 0, 16
8354; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v15, v2, 0, 16
8355; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v2, v17, 0, 16
8356; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v19, v18, 0, 16
8357; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v21, v16, 0, 16
8358; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v13, v12, 0, 16
8359; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v23, v1, 0, 16
8360; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v17, v3, 0, 16
8361; GCN-NOHSA-VI-NEXT:    v_bfe_i32 v25, v7, 0, 16
8362; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
8363; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v30, 31, v29
8364; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
8365; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
8366; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
8367; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
8368; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
8369; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
8370; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
8371; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
8372; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
8373; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
8374; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
8375; GCN-NOHSA-VI-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
8376; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
8377; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:64
8378; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
8379; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
8380; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48
8381; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0
8382; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
8383; GCN-NOHSA-VI-NEXT:    s_endpgm
8384;
8385; EG-LABEL: global_sextload_v32i16_to_v32i64:
8386; EG:       ; %bb.0:
8387; EG-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
8388; EG-NEXT:    TEX 0 @22
8389; EG-NEXT:    ALU 56, @31, KC0[CB0:0-32], KC1[]
8390; EG-NEXT:    TEX 2 @24
8391; EG-NEXT:    ALU 74, @88, KC0[CB0:0-32], KC1[]
8392; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T38.X, 0
8393; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
8394; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T34.X, 0
8395; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T33.X, 0
8396; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T32.X, 0
8397; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T31.X, 0
8398; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T30.X, 0
8399; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T29.X, 0
8400; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T28.X, 0
8401; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T27.X, 0
8402; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T26.X, 0
8403; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
8404; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T24.X, 0
8405; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 0
8406; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
8407; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 1
8408; EG-NEXT:    CF_END
8409; EG-NEXT:    Fetch clause starting at 22:
8410; EG-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 0, #1
8411; EG-NEXT:    Fetch clause starting at 24:
8412; EG-NEXT:     VTX_READ_128 T38.XYZW, T19.X, 48, #1
8413; EG-NEXT:     VTX_READ_128 T39.XYZW, T19.X, 32, #1
8414; EG-NEXT:     VTX_READ_128 T40.XYZW, T19.X, 16, #1
8415; EG-NEXT:    ALU clause starting at 30:
8416; EG-NEXT:     MOV * T19.X, KC0[2].Z,
8417; EG-NEXT:    ALU clause starting at 31:
8418; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
8419; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
8420; EG-NEXT:     LSHR T21.X, PV.W, literal.x,
8421; EG-NEXT:     LSHR * T22.X, KC0[2].Y, literal.x,
8422; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
8423; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
8424; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
8425; EG-NEXT:     LSHR T23.X, PV.W, literal.x,
8426; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8427; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
8428; EG-NEXT:     LSHR T24.X, PV.W, literal.x,
8429; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8430; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
8431; EG-NEXT:     LSHR T25.X, PV.W, literal.x,
8432; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8433; EG-NEXT:    2(2.802597e-45), 64(8.968310e-44)
8434; EG-NEXT:     LSHR T26.X, PV.W, literal.x,
8435; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8436; EG-NEXT:    2(2.802597e-45), 112(1.569454e-43)
8437; EG-NEXT:     LSHR T27.X, PV.W, literal.x,
8438; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8439; EG-NEXT:    2(2.802597e-45), 96(1.345247e-43)
8440; EG-NEXT:     LSHR T28.X, PV.W, literal.x,
8441; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8442; EG-NEXT:    2(2.802597e-45), 144(2.017870e-43)
8443; EG-NEXT:     LSHR T29.X, PV.W, literal.x,
8444; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8445; EG-NEXT:    2(2.802597e-45), 128(1.793662e-43)
8446; EG-NEXT:     LSHR T30.X, PV.W, literal.x,
8447; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8448; EG-NEXT:    2(2.802597e-45), 176(2.466285e-43)
8449; EG-NEXT:     LSHR T31.X, PV.W, literal.x,
8450; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8451; EG-NEXT:    2(2.802597e-45), 160(2.242078e-43)
8452; EG-NEXT:     LSHR T32.X, PV.W, literal.x,
8453; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8454; EG-NEXT:    2(2.802597e-45), 208(2.914701e-43)
8455; EG-NEXT:     LSHR T33.X, PV.W, literal.x,
8456; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8457; EG-NEXT:    2(2.802597e-45), 192(2.690493e-43)
8458; EG-NEXT:     LSHR T34.X, PV.W, literal.x,
8459; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.y,
8460; EG-NEXT:     ASHR * T35.W, T20.Y, literal.z,
8461; EG-NEXT:    2(2.802597e-45), 240(3.363116e-43)
8462; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
8463; EG-NEXT:     LSHR T36.X, PV.W, literal.x,
8464; EG-NEXT:     ASHR T35.Z, T20.Y, literal.y,
8465; EG-NEXT:     ASHR * T37.W, T20.X, literal.z,
8466; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
8467; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
8468; EG-NEXT:     BFE_INT T35.X, T20.Y, 0.0, literal.x,
8469; EG-NEXT:     ASHR * T37.Z, T20.X, literal.x,
8470; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
8471; EG-NEXT:     BFE_INT T37.X, T20.X, 0.0, literal.x,
8472; EG-NEXT:     ASHR T35.Y, PV.X, literal.y,
8473; EG-NEXT:     ASHR * T19.W, T20.W, literal.y,
8474; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8475; EG-NEXT:    ALU clause starting at 88:
8476; EG-NEXT:     ASHR T19.Z, T20.W, literal.x,
8477; EG-NEXT:     ASHR * T41.W, T20.Z, literal.y,
8478; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8479; EG-NEXT:     BFE_INT T19.X, T20.W, 0.0, literal.x,
8480; EG-NEXT:     ASHR T37.Y, T37.X, literal.y,
8481; EG-NEXT:     ASHR T41.Z, T20.Z, literal.x,
8482; EG-NEXT:     ASHR * T20.W, T40.Y, literal.y,
8483; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8484; EG-NEXT:     BFE_INT T41.X, T20.Z, 0.0, literal.x,
8485; EG-NEXT:     ASHR T19.Y, PV.X, literal.y,
8486; EG-NEXT:     ASHR T20.Z, T40.Y, literal.x,
8487; EG-NEXT:     ASHR * T42.W, T40.X, literal.y,
8488; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8489; EG-NEXT:     BFE_INT T20.X, T40.Y, 0.0, literal.x,
8490; EG-NEXT:     ASHR T41.Y, PV.X, literal.y,
8491; EG-NEXT:     ASHR T42.Z, T40.X, literal.x,
8492; EG-NEXT:     ASHR * T43.W, T40.W, literal.y,
8493; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8494; EG-NEXT:     BFE_INT T42.X, T40.X, 0.0, literal.x,
8495; EG-NEXT:     ASHR T20.Y, PV.X, literal.y,
8496; EG-NEXT:     ASHR T43.Z, T40.W, literal.x,
8497; EG-NEXT:     ASHR * T44.W, T40.Z, literal.y,
8498; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8499; EG-NEXT:     BFE_INT T43.X, T40.W, 0.0, literal.x,
8500; EG-NEXT:     ASHR T42.Y, PV.X, literal.y,
8501; EG-NEXT:     ASHR T44.Z, T40.Z, literal.x,
8502; EG-NEXT:     ASHR * T40.W, T39.Y, literal.y,
8503; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8504; EG-NEXT:     BFE_INT T44.X, T40.Z, 0.0, literal.x,
8505; EG-NEXT:     ASHR T43.Y, PV.X, literal.y,
8506; EG-NEXT:     ASHR T40.Z, T39.Y, literal.x,
8507; EG-NEXT:     ASHR * T45.W, T39.X, literal.y,
8508; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8509; EG-NEXT:     BFE_INT T40.X, T39.Y, 0.0, literal.x,
8510; EG-NEXT:     ASHR T44.Y, PV.X, literal.y,
8511; EG-NEXT:     ASHR T45.Z, T39.X, literal.x,
8512; EG-NEXT:     ASHR * T46.W, T39.W, literal.y,
8513; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8514; EG-NEXT:     BFE_INT T45.X, T39.X, 0.0, literal.x,
8515; EG-NEXT:     ASHR T40.Y, PV.X, literal.y,
8516; EG-NEXT:     ASHR T46.Z, T39.W, literal.x,
8517; EG-NEXT:     ASHR * T47.W, T39.Z, literal.y,
8518; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8519; EG-NEXT:     BFE_INT T46.X, T39.W, 0.0, literal.x,
8520; EG-NEXT:     ASHR T45.Y, PV.X, literal.y,
8521; EG-NEXT:     ASHR T47.Z, T39.Z, literal.x,
8522; EG-NEXT:     ASHR * T39.W, T38.Y, literal.y,
8523; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8524; EG-NEXT:     BFE_INT T47.X, T39.Z, 0.0, literal.x,
8525; EG-NEXT:     ASHR T46.Y, PV.X, literal.y,
8526; EG-NEXT:     ASHR T39.Z, T38.Y, literal.x,
8527; EG-NEXT:     ASHR * T48.W, T38.X, literal.y,
8528; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8529; EG-NEXT:     BFE_INT T39.X, T38.Y, 0.0, literal.x,
8530; EG-NEXT:     ASHR T47.Y, PV.X, literal.y,
8531; EG-NEXT:     ASHR T48.Z, T38.X, literal.x,
8532; EG-NEXT:     ASHR * T49.W, T38.W, literal.y,
8533; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8534; EG-NEXT:     BFE_INT T48.X, T38.X, 0.0, literal.x,
8535; EG-NEXT:     ASHR T39.Y, PV.X, literal.y,
8536; EG-NEXT:     ASHR T49.Z, T38.W, literal.x,
8537; EG-NEXT:     ASHR * T50.W, T38.Z, literal.y,
8538; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8539; EG-NEXT:     BFE_INT T49.X, T38.W, 0.0, literal.x,
8540; EG-NEXT:     ASHR T48.Y, PV.X, literal.y,
8541; EG-NEXT:     ASHR * T50.Z, T38.Z, literal.x,
8542; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8543; EG-NEXT:     BFE_INT T50.X, T38.Z, 0.0, literal.x,
8544; EG-NEXT:     ASHR T49.Y, PV.X, literal.y,
8545; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.z,
8546; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8547; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
8548; EG-NEXT:     LSHR T38.X, PV.W, literal.x,
8549; EG-NEXT:     ASHR * T50.Y, PV.X, literal.y,
8550; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
8551;
8552; CM-LABEL: global_sextload_v32i16_to_v32i64:
8553; CM:       ; %bb.0:
8554; CM-NEXT:    ALU 0, @30, KC0[CB0:0-32], KC1[]
8555; CM-NEXT:    TEX 0 @22
8556; CM-NEXT:    ALU 55, @31, KC0[CB0:0-32], KC1[]
8557; CM-NEXT:    TEX 2 @24
8558; CM-NEXT:    ALU 73, @87, KC0[CB0:0-32], KC1[]
8559; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T38, T50.X
8560; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T49, T36.X
8561; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T48, T34.X
8562; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T47, T33.X
8563; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T39, T32.X
8564; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T46, T31.X
8565; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T45, T30.X
8566; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T44, T29.X
8567; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T40, T28.X
8568; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T43, T27.X
8569; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T42, T26.X
8570; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T41, T25.X
8571; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T20, T24.X
8572; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T19, T23.X
8573; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T37, T22.X
8574; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T35, T21.X
8575; CM-NEXT:    CF_END
8576; CM-NEXT:    Fetch clause starting at 22:
8577; CM-NEXT:     VTX_READ_128 T20.XYZW, T19.X, 48, #1
8578; CM-NEXT:    Fetch clause starting at 24:
8579; CM-NEXT:     VTX_READ_128 T38.XYZW, T19.X, 0, #1
8580; CM-NEXT:     VTX_READ_128 T39.XYZW, T19.X, 16, #1
8581; CM-NEXT:     VTX_READ_128 T40.XYZW, T19.X, 32, #1
8582; CM-NEXT:    ALU clause starting at 30:
8583; CM-NEXT:     MOV * T19.X, KC0[2].Z,
8584; CM-NEXT:    ALU clause starting at 31:
8585; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
8586; CM-NEXT:    224(3.138909e-43), 0(0.000000e+00)
8587; CM-NEXT:     LSHR T21.X, PV.W, literal.x,
8588; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8589; CM-NEXT:    2(2.802597e-45), 240(3.363116e-43)
8590; CM-NEXT:     LSHR T22.X, PV.W, literal.x,
8591; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8592; CM-NEXT:    2(2.802597e-45), 192(2.690493e-43)
8593; CM-NEXT:     LSHR T23.X, PV.W, literal.x,
8594; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8595; CM-NEXT:    2(2.802597e-45), 208(2.914701e-43)
8596; CM-NEXT:     LSHR T24.X, PV.W, literal.x,
8597; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8598; CM-NEXT:    2(2.802597e-45), 160(2.242078e-43)
8599; CM-NEXT:     LSHR T25.X, PV.W, literal.x,
8600; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8601; CM-NEXT:    2(2.802597e-45), 176(2.466285e-43)
8602; CM-NEXT:     LSHR T26.X, PV.W, literal.x,
8603; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8604; CM-NEXT:    2(2.802597e-45), 128(1.793662e-43)
8605; CM-NEXT:     LSHR T27.X, PV.W, literal.x,
8606; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8607; CM-NEXT:    2(2.802597e-45), 144(2.017870e-43)
8608; CM-NEXT:     LSHR T28.X, PV.W, literal.x,
8609; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8610; CM-NEXT:    2(2.802597e-45), 96(1.345247e-43)
8611; CM-NEXT:     LSHR T29.X, PV.W, literal.x,
8612; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8613; CM-NEXT:    2(2.802597e-45), 112(1.569454e-43)
8614; CM-NEXT:     LSHR T30.X, PV.W, literal.x,
8615; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8616; CM-NEXT:    2(2.802597e-45), 64(8.968310e-44)
8617; CM-NEXT:     LSHR T31.X, PV.W, literal.x,
8618; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8619; CM-NEXT:    2(2.802597e-45), 80(1.121039e-43)
8620; CM-NEXT:     LSHR T32.X, PV.W, literal.x,
8621; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8622; CM-NEXT:    2(2.802597e-45), 32(4.484155e-44)
8623; CM-NEXT:     LSHR T33.X, PV.W, literal.x,
8624; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
8625; CM-NEXT:    2(2.802597e-45), 48(6.726233e-44)
8626; CM-NEXT:     LSHR T34.X, PV.W, literal.x,
8627; CM-NEXT:     ASHR * T35.W, T20.Z, literal.y,
8628; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
8629; CM-NEXT:     LSHR T36.X, KC0[2].Y, literal.x,
8630; CM-NEXT:     ASHR T35.Z, T20.Z, literal.y,
8631; CM-NEXT:     ASHR * T37.W, T20.W, literal.z,
8632; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
8633; CM-NEXT:    31(4.344025e-44), 0(0.000000e+00)
8634; CM-NEXT:     BFE_INT T35.X, T20.Z, 0.0, literal.x,
8635; CM-NEXT:     ASHR * T37.Z, T20.W, literal.x,
8636; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
8637; CM-NEXT:     BFE_INT T37.X, T20.W, 0.0, literal.x,
8638; CM-NEXT:     ASHR T35.Y, PV.X, literal.y,
8639; CM-NEXT:     ASHR * T19.W, T20.X, literal.y,
8640; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8641; CM-NEXT:    ALU clause starting at 87:
8642; CM-NEXT:     ASHR T19.Z, T20.X, literal.x,
8643; CM-NEXT:     ASHR * T20.W, T20.Y, literal.y,
8644; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8645; CM-NEXT:     BFE_INT T19.X, T20.X, 0.0, literal.x,
8646; CM-NEXT:     ASHR T37.Y, T37.X, literal.y, BS:VEC_120/SCL_212
8647; CM-NEXT:     ASHR T20.Z, T20.Y, literal.x,
8648; CM-NEXT:     ASHR * T41.W, T40.Z, literal.y,
8649; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8650; CM-NEXT:     BFE_INT T20.X, T20.Y, 0.0, literal.x,
8651; CM-NEXT:     ASHR T19.Y, PV.X, literal.y,
8652; CM-NEXT:     ASHR T41.Z, T40.Z, literal.x,
8653; CM-NEXT:     ASHR * T42.W, T40.W, literal.y,
8654; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8655; CM-NEXT:     BFE_INT T41.X, T40.Z, 0.0, literal.x,
8656; CM-NEXT:     ASHR T20.Y, PV.X, literal.y,
8657; CM-NEXT:     ASHR T42.Z, T40.W, literal.x,
8658; CM-NEXT:     ASHR * T43.W, T40.X, literal.y,
8659; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8660; CM-NEXT:     BFE_INT T42.X, T40.W, 0.0, literal.x,
8661; CM-NEXT:     ASHR T41.Y, PV.X, literal.y,
8662; CM-NEXT:     ASHR T43.Z, T40.X, literal.x,
8663; CM-NEXT:     ASHR * T40.W, T40.Y, literal.y,
8664; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8665; CM-NEXT:     BFE_INT T43.X, T40.X, 0.0, literal.x,
8666; CM-NEXT:     ASHR T42.Y, PV.X, literal.y,
8667; CM-NEXT:     ASHR T40.Z, T40.Y, literal.x,
8668; CM-NEXT:     ASHR * T44.W, T39.Z, literal.y,
8669; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8670; CM-NEXT:     BFE_INT T40.X, T40.Y, 0.0, literal.x,
8671; CM-NEXT:     ASHR T43.Y, PV.X, literal.y,
8672; CM-NEXT:     ASHR T44.Z, T39.Z, literal.x,
8673; CM-NEXT:     ASHR * T45.W, T39.W, literal.y,
8674; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8675; CM-NEXT:     BFE_INT T44.X, T39.Z, 0.0, literal.x,
8676; CM-NEXT:     ASHR T40.Y, PV.X, literal.y,
8677; CM-NEXT:     ASHR T45.Z, T39.W, literal.x,
8678; CM-NEXT:     ASHR * T46.W, T39.X, literal.y,
8679; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8680; CM-NEXT:     BFE_INT T45.X, T39.W, 0.0, literal.x,
8681; CM-NEXT:     ASHR T44.Y, PV.X, literal.y,
8682; CM-NEXT:     ASHR T46.Z, T39.X, literal.x,
8683; CM-NEXT:     ASHR * T39.W, T39.Y, literal.y,
8684; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8685; CM-NEXT:     BFE_INT T46.X, T39.X, 0.0, literal.x,
8686; CM-NEXT:     ASHR T45.Y, PV.X, literal.y,
8687; CM-NEXT:     ASHR T39.Z, T39.Y, literal.x,
8688; CM-NEXT:     ASHR * T47.W, T38.Z, literal.y,
8689; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8690; CM-NEXT:     BFE_INT T39.X, T39.Y, 0.0, literal.x,
8691; CM-NEXT:     ASHR T46.Y, PV.X, literal.y,
8692; CM-NEXT:     ASHR T47.Z, T38.Z, literal.x,
8693; CM-NEXT:     ASHR * T48.W, T38.W, literal.y,
8694; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8695; CM-NEXT:     BFE_INT T47.X, T38.Z, 0.0, literal.x,
8696; CM-NEXT:     ASHR T39.Y, PV.X, literal.y,
8697; CM-NEXT:     ASHR T48.Z, T38.W, literal.x,
8698; CM-NEXT:     ASHR * T49.W, T38.X, literal.y,
8699; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8700; CM-NEXT:     BFE_INT T48.X, T38.W, 0.0, literal.x,
8701; CM-NEXT:     ASHR T47.Y, PV.X, literal.y,
8702; CM-NEXT:     ASHR T49.Z, T38.X, literal.x,
8703; CM-NEXT:     ASHR * T38.W, T38.Y, literal.y,
8704; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8705; CM-NEXT:     BFE_INT T49.X, T38.X, 0.0, literal.x,
8706; CM-NEXT:     ASHR T48.Y, PV.X, literal.y,
8707; CM-NEXT:     ASHR * T38.Z, T38.Y, literal.x,
8708; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8709; CM-NEXT:     BFE_INT T38.X, T38.Y, 0.0, literal.x,
8710; CM-NEXT:     ASHR T49.Y, PV.X, literal.y,
8711; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
8712; CM-NEXT:    16(2.242078e-44), 31(4.344025e-44)
8713; CM-NEXT:     LSHR T50.X, PV.W, literal.x,
8714; CM-NEXT:     ASHR * T38.Y, PV.X, literal.y,
8715; CM-NEXT:    2(2.802597e-45), 31(4.344025e-44)
8716  %load = load <32 x i16>, ptr addrspace(1) %in
8717  %ext = sext <32 x i16> %load to <32 x i64>
8718  store <32 x i64> %ext, ptr addrspace(1) %out
8719  ret void
8720}
8721
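; NOTE(review): the two v64i16 -> v64i64 tests below are disabled. Presumably this
; is related to the spilling FIXME at the top of this file — confirm before re-enabling.
8722; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {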
8723;   %load = load <64 x i16>, ptr addrspace(1) %in
8724;   %ext = zext <64 x i16> %load to <64 x i64>
8725;   store <64 x i64> %ext, ptr addrspace(1) %out
8726;   ret void
8727; }
8728
8729; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
8730;   %load = load <64 x i16>, ptr addrspace(1) %in
8731;   %ext = sext <64 x i16> %load to <64 x i64>
8732;   store <64 x i64> %ext, ptr addrspace(1) %out
8733;   ret void
8734; }
8735
; Attribute group #0: nounwind plus an explicit flat work-group size range.
; "amdgpu-flat-work-group-size" takes "min,max" (see the AMDGPU usage guide), so
; "1024,1024" sets both bounds to 1024.
; NOTE(review): in the visible tail of this file, #0 is referenced only by the
; commented-out v64i16 tests above — confirm whether any live kernel still uses it.
8736attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
8737