xref: /llvm-project/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
8
; Builds a <2 x half> from two scalar loads in addrspace(5) using constant
; addresses: element 0 is read from null + 1 half (byte offset 2) and element 1
; from null itself. The assertions below expect the element-1 load to be
; selected as a d16_hi load that writes into the same register as the
; element-0 load.
9define <2 x half> @chain_hi_to_lo_private() {
10; GFX900-LABEL: chain_hi_to_lo_private:
11; GFX900:       ; %bb.0: ; %bb
12; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
14; GFX900-NEXT:    s_nop 0
15; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
16; GFX900-NEXT:    s_waitcnt vmcnt(0)
17; GFX900-NEXT:    s_setpc_b64 s[30:31]
18;
19; FLATSCR-LABEL: chain_hi_to_lo_private:
20; FLATSCR:       ; %bb.0: ; %bb
21; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; FLATSCR-NEXT:    s_mov_b32 s0, 2
23; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0
24; FLATSCR-NEXT:    s_mov_b32 s0, 0
25; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s0
26; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
27; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
28;
29; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private:
30; GFX10_DEFAULT:       ; %bb.0: ; %bb
31; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX10_DEFAULT-NEXT:    s_clause 0x1
33; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
34; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
35; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
36; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
37;
38; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private:
39; FLATSCR_GFX10:       ; %bb.0: ; %bb
40; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 2
42; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, off, s0
43; FLATSCR_GFX10-NEXT:    s_waitcnt_depctr 0xffe3
44; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 0
45; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v0, off, s0
46; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
47; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
48;
49; GFX11-LABEL: chain_hi_to_lo_private:
50; GFX11:       ; %bb.0: ; %bb
51; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX11-NEXT:    s_mov_b32 s0, 2
53; GFX11-NEXT:    scratch_load_u16 v0, off, s0
54; GFX11-NEXT:    s_mov_b32 s0, 0
55; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, off, s0
56; GFX11-NEXT:    s_waitcnt vmcnt(0)
57; GFX11-NEXT:    s_setpc_b64 s[30:31]
58bb:
  ; The low element is loaded first in program order, from the higher address.
59  %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
60  %load_lo = load half, ptr addrspace(5) %gep_lo
61  %load_hi = load half, ptr addrspace(5) null
62
  ; Insert the low element into lane 0, then the high element into lane 1.
63  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
64  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
65
66  ret <2 x half> %result
67}
68
; Same lane-0 / lane-1 construction as the constant-address addrspace(5) case,
; but each half is loaded through its own pointer argument (%base_lo for lane 0,
; %base_hi for lane 1). The assertions below still expect a plain 16-bit load
; followed by a d16_hi load into the same result register.
69define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base_lo, ptr addrspace(5) %base_hi) {
70; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
71; GFX900:       ; %bb.0: ; %bb
72; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; GFX900-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
74; GFX900-NEXT:    s_nop 0
75; GFX900-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
76; GFX900-NEXT:    s_waitcnt vmcnt(0)
77; GFX900-NEXT:    s_setpc_b64 s[30:31]
78;
79; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases:
80; FLATSCR:       ; %bb.0: ; %bb
81; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82; FLATSCR-NEXT:    scratch_load_ushort v0, v0, off
83; FLATSCR-NEXT:    s_nop 0
84; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, v1, off
85; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
86; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
87;
88; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_different_bases:
89; GFX10_DEFAULT:       ; %bb.0: ; %bb
90; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
91; GFX10_DEFAULT-NEXT:    s_clause 0x1
92; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
93; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
94; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
95; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
96;
97; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases:
98; FLATSCR_GFX10:       ; %bb.0: ; %bb
99; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, v0, off
101; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v0, v1, off
102; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
103; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
104;
105; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
106; GFX11:       ; %bb.0: ; %bb
107; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GFX11-NEXT:    scratch_load_u16 v0, v0, off
109; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
110; GFX11-NEXT:    s_waitcnt vmcnt(0)
111; GFX11-NEXT:    s_setpc_b64 s[30:31]
112bb:
  ; The two loads use unrelated base pointers supplied by the caller.
113  %load_lo = load half, ptr addrspace(5) %base_lo
114  %load_hi = load half, ptr addrspace(5) %base_hi
115
116  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
117  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
118
119  ret <2 x half> %result
120}
121
; Lane 0 is produced by arithmetic (%in + 1.0) instead of a load; only lane 1 is
; loaded, from the addrspace(5) pointer %base. The assertions below expect the
; add to be emitted first and the d16_hi load to write into the add's result
; register, which is then copied to v0.
; The spelling "arithmatic" is part of the symbol name matched by the -LABEL
; assertions, so it is kept as-is.
122define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
123; GFX900-LABEL: chain_hi_to_lo_arithmatic:
124; GFX900:       ; %bb.0: ; %bb
125; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX900-NEXT:    v_add_f16_e32 v1, 1.0, v1
127; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
128; GFX900-NEXT:    s_waitcnt vmcnt(0)
129; GFX900-NEXT:    v_mov_b32_e32 v0, v1
130; GFX900-NEXT:    s_setpc_b64 s[30:31]
131;
132; FLATSCR-LABEL: chain_hi_to_lo_arithmatic:
133; FLATSCR:       ; %bb.0: ; %bb
134; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; FLATSCR-NEXT:    v_add_f16_e32 v1, 1.0, v1
136; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
137; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
138; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
139; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
140;
141; GFX10_DEFAULT-LABEL: chain_hi_to_lo_arithmatic:
142; GFX10_DEFAULT:       ; %bb.0: ; %bb
143; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144; GFX10_DEFAULT-NEXT:    v_add_f16_e32 v1, 1.0, v1
145; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
146; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
147; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v0, v1
148; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
149;
150; FLATSCR_GFX10-LABEL: chain_hi_to_lo_arithmatic:
151; FLATSCR_GFX10:       ; %bb.0: ; %bb
152; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; FLATSCR_GFX10-NEXT:    v_add_f16_e32 v1, 1.0, v1
154; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
155; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
156; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
157; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
158;
159; GFX11-LABEL: chain_hi_to_lo_arithmatic:
160; GFX11:       ; %bb.0: ; %bb
161; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
163; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
164; GFX11-NEXT:    s_waitcnt vmcnt(0)
165; GFX11-NEXT:    v_mov_b32_e32 v0, v1
166; GFX11-NEXT:    s_setpc_b64 s[30:31]
167bb:
  ; Lane 0 comes from an ALU result, lane 1 from memory.
168  %arith_lo = fadd half %in, 1.0
169  %load_hi = load half, ptr addrspace(5) %base
170
171  %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
172  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
173
174  ret <2 x half> %result
175}
176
; Constant-address variant for addrspace(3): lane 0 is loaded from null + 1 half
; (byte offset 2) and lane 1 from null. The assertions below expect a 16-bit DS
; load followed, after a wait, by a d16_hi DS load into the same register.
177define <2 x half> @chain_hi_to_lo_group() {
178; GCN-LABEL: chain_hi_to_lo_group:
179; GCN:       ; %bb.0: ; %bb
180; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181; GCN-NEXT:    v_mov_b32_e32 v1, 0
182; GCN-NEXT:    ds_read_u16 v0, v1 offset:2
183; GCN-NEXT:    s_waitcnt lgkmcnt(0)
184; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
185; GCN-NEXT:    s_waitcnt lgkmcnt(0)
186; GCN-NEXT:    s_setpc_b64 s[30:31]
187;
188; GFX10-LABEL: chain_hi_to_lo_group:
189; GFX10:       ; %bb.0: ; %bb
190; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191; GFX10-NEXT:    v_mov_b32_e32 v1, 0
192; GFX10-NEXT:    ds_read_u16 v0, v1 offset:2
193; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX10-NEXT:    ds_read_u16_d16_hi v0, v1
195; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX10-NEXT:    s_setpc_b64 s[30:31]
197;
198; GFX11-LABEL: chain_hi_to_lo_group:
199; GFX11:       ; %bb.0: ; %bb
200; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201; GFX11-NEXT:    v_mov_b32_e32 v1, 0
202; GFX11-NEXT:    ds_load_u16 v0, v1 offset:2
203; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX11-NEXT:    ds_load_u16_d16_hi v0, v1
205; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX11-NEXT:    s_setpc_b64 s[30:31]
207bb:
  ; The low element is loaded first in program order, from the higher address.
208  %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1
209  %load_lo = load half, ptr addrspace(3) %gep_lo
210  %load_hi = load half, ptr addrspace(3) null
211
212  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
213  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
214
215  ret <2 x half> %result
216}
217
; addrspace(3) variant in which lane 0 and lane 1 are loaded through separate
; pointer arguments (%base_lo and %base_hi). The assertions below expect a
; 16-bit DS load followed by a d16_hi DS load into the same result register.
218define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_lo, ptr addrspace(3) %base_hi) {
219; GCN-LABEL: chain_hi_to_lo_group_different_bases:
220; GCN:       ; %bb.0: ; %bb
221; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222; GCN-NEXT:    ds_read_u16 v0, v0
223; GCN-NEXT:    s_waitcnt lgkmcnt(0)
224; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
225; GCN-NEXT:    s_waitcnt lgkmcnt(0)
226; GCN-NEXT:    s_setpc_b64 s[30:31]
227;
228; GFX10-LABEL: chain_hi_to_lo_group_different_bases:
229; GFX10:       ; %bb.0: ; %bb
230; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
231; GFX10-NEXT:    ds_read_u16 v0, v0
232; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX10-NEXT:    ds_read_u16_d16_hi v0, v1
234; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX10-NEXT:    s_setpc_b64 s[30:31]
236;
237; GFX11-LABEL: chain_hi_to_lo_group_different_bases:
238; GFX11:       ; %bb.0: ; %bb
239; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240; GFX11-NEXT:    ds_load_u16 v0, v0
241; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX11-NEXT:    ds_load_u16_d16_hi v0, v1
243; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX11-NEXT:    s_setpc_b64 s[30:31]
245bb:
  ; The two loads use unrelated base pointers supplied by the caller.
246  %load_lo = load half, ptr addrspace(3) %base_lo
247  %load_hi = load half, ptr addrspace(3) %base_hi
248
249  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
250  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
251
252  ret <2 x half> %result
253}
254
; Constant-address variant for addrspace(1): lane 0 is loaded from null + 1 half
; (byte offset 2) and lane 1 from null. The assertions below expect each address
; to be materialized in a VGPR pair, with a 16-bit global load followed by a
; d16_hi global load into the same result register.
255define <2 x half> @chain_hi_to_lo_global() {
256; GCN-LABEL: chain_hi_to_lo_global:
257; GCN:       ; %bb.0: ; %bb
258; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GCN-NEXT:    v_mov_b32_e32 v0, 2
260; GCN-NEXT:    v_mov_b32_e32 v1, 0
261; GCN-NEXT:    global_load_ushort v0, v[0:1], off
262; GCN-NEXT:    v_mov_b32_e32 v1, 0
263; GCN-NEXT:    v_mov_b32_e32 v2, 0
264; GCN-NEXT:    global_load_short_d16_hi v0, v[1:2], off
265; GCN-NEXT:    s_waitcnt vmcnt(0)
266; GCN-NEXT:    s_setpc_b64 s[30:31]
267;
268; GFX10-LABEL: chain_hi_to_lo_global:
269; GFX10:       ; %bb.0: ; %bb
270; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX10-NEXT:    v_mov_b32_e32 v0, 2
272; GFX10-NEXT:    v_mov_b32_e32 v1, 0
273; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
274; GFX10-NEXT:    v_mov_b32_e32 v1, 0
275; GFX10-NEXT:    v_mov_b32_e32 v2, 0
276; GFX10-NEXT:    global_load_short_d16_hi v0, v[1:2], off
277; GFX10-NEXT:    s_waitcnt vmcnt(0)
278; GFX10-NEXT:    s_setpc_b64 s[30:31]
279;
280; GFX11-LABEL: chain_hi_to_lo_global:
281; GFX11:       ; %bb.0: ; %bb
282; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283; GFX11-NEXT:    v_mov_b32_e32 v0, 2
284; GFX11-NEXT:    v_mov_b32_e32 v1, 0
285; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
286; GFX11-NEXT:    v_mov_b32_e32 v1, 0
287; GFX11-NEXT:    v_mov_b32_e32 v2, 0
288; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
289; GFX11-NEXT:    s_waitcnt vmcnt(0)
290; GFX11-NEXT:    s_setpc_b64 s[30:31]
291bb:
  ; The low element is loaded first in program order, from the higher address.
292  %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
293  %load_lo = load half, ptr addrspace(1) %gep_lo
294  %load_hi = load half, ptr addrspace(1) null
295
296  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
297  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
298
299  ret <2 x half> %result
300}
301
; addrspace(1) variant in which lane 0 and lane 1 are loaded through separate
; pointer arguments (%base_lo and %base_hi). The assertions below expect a
; 16-bit global load followed by a d16_hi global load into the same register.
302define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_lo, ptr addrspace(1) %base_hi) {
303; GCN-LABEL: chain_hi_to_lo_global_different_bases:
304; GCN:       ; %bb.0: ; %bb
305; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306; GCN-NEXT:    global_load_ushort v0, v[0:1], off
307; GCN-NEXT:    s_nop 0
308; GCN-NEXT:    global_load_short_d16_hi v0, v[2:3], off
309; GCN-NEXT:    s_waitcnt vmcnt(0)
310; GCN-NEXT:    s_setpc_b64 s[30:31]
311;
312; GFX10-LABEL: chain_hi_to_lo_global_different_bases:
313; GFX10:       ; %bb.0: ; %bb
314; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
315; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
316; GFX10-NEXT:    global_load_short_d16_hi v0, v[2:3], off
317; GFX10-NEXT:    s_waitcnt vmcnt(0)
318; GFX10-NEXT:    s_setpc_b64 s[30:31]
319;
320; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
321; GFX11:       ; %bb.0: ; %bb
322; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
324; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
325; GFX11-NEXT:    s_waitcnt vmcnt(0)
326; GFX11-NEXT:    s_setpc_b64 s[30:31]
327bb:
  ; The two loads use unrelated base pointers supplied by the caller.
328  %load_lo = load half, ptr addrspace(1) %base_lo
329  %load_hi = load half, ptr addrspace(1) %base_hi
330
331  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
332  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
333
334  ret <2 x half> %result
335}
336
; Constant-address variant using unqualified `ptr` (default address space):
; lane 0 is loaded from null + 1 half (byte offset 2) and lane 1 from null. The
; assertions below expect a 16-bit flat load, a wait on both vmcnt and lgkmcnt,
; and then a d16_hi flat load into the same result register.
337define <2 x half> @chain_hi_to_lo_flat() {
338; GCN-LABEL: chain_hi_to_lo_flat:
339; GCN:       ; %bb.0: ; %bb
340; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341; GCN-NEXT:    v_mov_b32_e32 v0, 2
342; GCN-NEXT:    v_mov_b32_e32 v1, 0
343; GCN-NEXT:    flat_load_ushort v0, v[0:1]
344; GCN-NEXT:    v_mov_b32_e32 v1, 0
345; GCN-NEXT:    v_mov_b32_e32 v2, 0
346; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
347; GCN-NEXT:    flat_load_short_d16_hi v0, v[1:2]
348; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
349; GCN-NEXT:    s_setpc_b64 s[30:31]
350;
351; GFX10-LABEL: chain_hi_to_lo_flat:
352; GFX10:       ; %bb.0: ; %bb
353; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354; GFX10-NEXT:    v_mov_b32_e32 v0, 2
355; GFX10-NEXT:    v_mov_b32_e32 v1, 0
356; GFX10-NEXT:    flat_load_ushort v0, v[0:1]
357; GFX10-NEXT:    v_mov_b32_e32 v1, 0
358; GFX10-NEXT:    v_mov_b32_e32 v2, 0
359; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
360; GFX10-NEXT:    flat_load_short_d16_hi v0, v[1:2]
361; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
362; GFX10-NEXT:    s_setpc_b64 s[30:31]
363;
364; GFX11-LABEL: chain_hi_to_lo_flat:
365; GFX11:       ; %bb.0: ; %bb
366; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367; GFX11-NEXT:    v_mov_b32_e32 v0, 2
368; GFX11-NEXT:    v_mov_b32_e32 v1, 0
369; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
370; GFX11-NEXT:    v_mov_b32_e32 v1, 0
371; GFX11-NEXT:    v_mov_b32_e32 v2, 0
372; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
373; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
374; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
375; GFX11-NEXT:    s_setpc_b64 s[30:31]
376bb:
  ; The low element is loaded first in program order, from the higher address.
377  %gep_lo = getelementptr inbounds half, ptr null, i64 1
378  %load_lo = load half, ptr %gep_lo
379  %load_hi = load half, ptr null
380
381  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
382  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
383
384  ret <2 x half> %result
385}
386
; Unqualified-`ptr` variant in which lane 0 and lane 1 are loaded through
; separate pointer arguments (%base_lo and %base_hi). The assertions below
; expect a 16-bit flat load, a wait, and then a d16_hi flat load into the same
; result register.
387define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_hi) {
388; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
389; GCN:       ; %bb.0: ; %bb
390; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GCN-NEXT:    flat_load_ushort v0, v[0:1]
392; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
393; GCN-NEXT:    flat_load_short_d16_hi v0, v[2:3]
394; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
395; GCN-NEXT:    s_setpc_b64 s[30:31]
396;
397; GFX10-LABEL: chain_hi_to_lo_flat_different_bases:
398; GFX10:       ; %bb.0: ; %bb
399; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400; GFX10-NEXT:    flat_load_ushort v0, v[0:1]
401; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
402; GFX10-NEXT:    flat_load_short_d16_hi v0, v[2:3]
403; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
404; GFX10-NEXT:    s_setpc_b64 s[30:31]
405;
406; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
407; GFX11:       ; %bb.0: ; %bb
408; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
410; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
411; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[2:3]
412; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
413; GFX11-NEXT:    s_setpc_b64 s[30:31]
414bb:
  ; The two loads use unrelated base pointers supplied by the caller.
415  %load_lo = load half, ptr %base_lo
416  %load_hi = load half, ptr %base_hi
417
418  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
419  %result = insertelement <2 x half> %temp, half %load_hi, i32 1
420
421  ret <2 x half> %result
422}
423
424; Make sure we don't lose any of the private stores.
; Kernel that copies three i16 values from %in into a [3 x i16] alloca in
; addrspace(5) using volatile stores, then reads the alloca back as two
; overlapping, 2-byte-aligned <2 x i16> vectors (elements 0-1 and elements 1-2)
; and writes them to %out and %out + 1 vector. The assertions below expect all
; three 16-bit stores to the alloca to be emitted before the reloads.
425define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 {
426; GFX900-LABEL: vload2_private:
427; GFX900:       ; %bb.0: ; %entry
428; GFX900-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
429; GFX900-NEXT:    v_mov_b32_e32 v2, 0
430; GFX900-NEXT:    s_add_u32 s0, s0, s17
431; GFX900-NEXT:    s_addc_u32 s1, s1, 0
432; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5]
434; GFX900-NEXT:    s_waitcnt vmcnt(0)
435; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0
436; GFX900-NEXT:    s_waitcnt vmcnt(0)
437; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:2
438; GFX900-NEXT:    s_waitcnt vmcnt(0)
439; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
440; GFX900-NEXT:    s_waitcnt vmcnt(0)
441; GFX900-NEXT:    global_load_ushort v0, v2, s[4:5] offset:4
442; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
443; GFX900-NEXT:    s_waitcnt vmcnt(0)
444; GFX900-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
445; GFX900-NEXT:    s_waitcnt vmcnt(0)
446; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
447; GFX900-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
448; GFX900-NEXT:    s_waitcnt vmcnt(1)
449; GFX900-NEXT:    v_mov_b32_e32 v1, v0
450; GFX900-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
451; GFX900-NEXT:    s_waitcnt vmcnt(1)
452; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
453; GFX900-NEXT:    s_waitcnt vmcnt(0)
454; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
455; GFX900-NEXT:    s_endpgm
456;
457; FLATSCR-LABEL: vload2_private:
458; FLATSCR:       ; %bb.0: ; %entry
459; FLATSCR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
460; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
461; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
462; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
463; FLATSCR-NEXT:    s_mov_b32 s4, 0
464; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
465; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1]
466; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
467; FLATSCR-NEXT:    scratch_store_short off, v0, s4
468; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
469; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:2
470; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
471; FLATSCR-NEXT:    scratch_store_short off, v0, s4 offset:2
472; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
473; FLATSCR-NEXT:    global_load_ushort v0, v2, s[0:1] offset:4
474; FLATSCR-NEXT:    s_mov_b32 s0, 0
475; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
476; FLATSCR-NEXT:    scratch_store_short off, v0, s0 offset:4
477; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
478; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0 offset:2
479; FLATSCR-NEXT:    scratch_load_ushort v3, off, s0
480; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
481; FLATSCR-NEXT:    v_mov_b32_e32 v1, v0
482; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, s0 offset:4
483; FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
484; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
485; FLATSCR-NEXT:    v_perm_b32 v0, v0, v3, s0
486; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
487; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
488; FLATSCR-NEXT:    s_endpgm
489;
490; GFX10_DEFAULT-LABEL: vload2_private:
491; GFX10_DEFAULT:       ; %bb.0: ; %entry
492; GFX10_DEFAULT-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
493; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v2, 0
494; GFX10_DEFAULT-NEXT:    s_add_u32 s0, s0, s17
495; GFX10_DEFAULT-NEXT:    s_addc_u32 s1, s1, 0
496; GFX10_DEFAULT-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX10_DEFAULT-NEXT:    global_load_ushort v0, v2, s[4:5]
498; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
499; GFX10_DEFAULT-NEXT:    buffer_store_short v0, off, s[0:3], 0
500; GFX10_DEFAULT-NEXT:    s_waitcnt_vscnt null, 0x0
501; GFX10_DEFAULT-NEXT:    global_load_ushort v0, v2, s[4:5] offset:2
502; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
503; GFX10_DEFAULT-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
504; GFX10_DEFAULT-NEXT:    s_waitcnt_vscnt null, 0x0
505; GFX10_DEFAULT-NEXT:    global_load_ushort v0, v2, s[4:5] offset:4
506; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
507; GFX10_DEFAULT-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
508; GFX10_DEFAULT-NEXT:    s_waitcnt_vscnt null, 0x0
509; GFX10_DEFAULT-NEXT:    s_clause 0x1
510; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
511; GFX10_DEFAULT-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
512; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(1)
513; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v1, v0
514; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
515; GFX10_DEFAULT-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
516; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4
517; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
518; GFX10_DEFAULT-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
519; GFX10_DEFAULT-NEXT:    s_endpgm
520;
521; FLATSCR_GFX10-LABEL: vload2_private:
522; FLATSCR_GFX10:       ; %bb.0: ; %entry
523; FLATSCR_GFX10-NEXT:    s_add_u32 s8, s8, s13
524; FLATSCR_GFX10-NEXT:    s_addc_u32 s9, s9, 0
525; FLATSCR_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
526; FLATSCR_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
527; FLATSCR_GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
528; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v2, 0
529; FLATSCR_GFX10-NEXT:    s_mov_b32 s4, 0
530; FLATSCR_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
531; FLATSCR_GFX10-NEXT:    global_load_ushort v0, v2, s[0:1]
532; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
533; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s4
534; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
535; FLATSCR_GFX10-NEXT:    global_load_ushort v0, v2, s[0:1] offset:2
536; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
537; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s4 offset:2
538; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
539; FLATSCR_GFX10-NEXT:    global_load_ushort v0, v2, s[0:1] offset:4
540; FLATSCR_GFX10-NEXT:    s_waitcnt_depctr 0xffe3
541; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 0
542; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
543; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s0 offset:4
544; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
545; FLATSCR_GFX10-NEXT:    s_clause 0x1
546; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, off, s0 offset:2
547; FLATSCR_GFX10-NEXT:    scratch_load_ushort v3, off, s0
548; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(1)
549; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v1, v0
550; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
551; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
552; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, off, s0 offset:4
553; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
554; FLATSCR_GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
555; FLATSCR_GFX10-NEXT:    s_endpgm
556;
557; GFX11-LABEL: vload2_private:
558; GFX11:       ; %bb.0: ; %entry
559; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
560; GFX11-NEXT:    v_mov_b32_e32 v2, 0
561; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX11-NEXT:    global_load_u16 v0, v2, s[0:1]
563; GFX11-NEXT:    s_waitcnt vmcnt(0)
564; GFX11-NEXT:    scratch_store_b16 off, v0, off dlc
565; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
566; GFX11-NEXT:    global_load_u16 v0, v2, s[0:1] offset:2
567; GFX11-NEXT:    s_waitcnt vmcnt(0)
568; GFX11-NEXT:    scratch_store_b16 off, v0, off offset:2 dlc
569; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
570; GFX11-NEXT:    global_load_u16 v0, v2, s[0:1] offset:4
571; GFX11-NEXT:    s_waitcnt vmcnt(0)
572; GFX11-NEXT:    scratch_store_b16 off, v0, off offset:4 dlc
573; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
574; GFX11-NEXT:    s_clause 0x1
575; GFX11-NEXT:    scratch_load_u16 v0, off, off offset:2
576; GFX11-NEXT:    scratch_load_u16 v3, off, off
577; GFX11-NEXT:    s_waitcnt vmcnt(1)
578; GFX11-NEXT:    v_mov_b32_e32 v1, v0
579; GFX11-NEXT:    s_waitcnt vmcnt(0)
580; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
581; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, off offset:4
582; GFX11-NEXT:    s_waitcnt vmcnt(0)
583; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
584; GFX11-NEXT:    s_endpgm
585entry:
  ; Fill the three-element alloca one i16 at a time; every store is volatile.
586  %loc = alloca [3 x i16], align 2, addrspace(5)
587  %tmp = load i16, ptr addrspace(1) %in, align 2
588  store volatile i16 %tmp, ptr addrspace(5) %loc
589  %arrayidx.1 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1
590  %tmp1 = load i16, ptr addrspace(1) %arrayidx.1, align 2
591  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1
592  store volatile i16 %tmp1, ptr addrspace(5) %loc.2.sroa_idx3
593  %arrayidx.2 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 2
594  %tmp2 = load i16, ptr addrspace(1) %arrayidx.2, align 2
595  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 2
596  store volatile i16 %tmp2, ptr addrspace(5) %loc.4.sroa_idx
  ; First vector covers elements 0 and 1 of the alloca.
597  %loc.0. = load <2 x i16>, ptr addrspace(5) %loc, align 2
598  store <2 x i16> %loc.0., ptr addrspace(1) %out, align 4
  ; Second vector starts at element 1, so it overlaps the first by one i16.
599  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1
600  %loc.2. = load <2 x i16>, ptr addrspace(5) %loc.2.sroa_idx, align 2
601  %arrayidx6 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 1
602  store <2 x i16> %loc.2., ptr addrspace(1) %arrayidx6, align 4
603  ret void
604}
605
606; There is another instruction between the misordered instruction and
607; the value dependent load, so a simple operand check is insufficient.
; Loads two i16 values from addrspace(3) (%ptr + 1 element for lane 0, %ptr for
; lane 1), but a vector add of <12, 12> is applied after the lane-1 insert and
; before the lane-0 insert. The assertions below expect the d16_hi load first,
; then the packed add, then a d16 (low-half) load into the same register.
608define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) {
609; GCN-LABEL: chain_hi_to_lo_group_other_dep:
610; GCN:       ; %bb.0: ; %bb
611; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; GCN-NEXT:    ds_read_u16_d16_hi v1, v0
613; GCN-NEXT:    s_waitcnt lgkmcnt(0)
614; GCN-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
615; GCN-NEXT:    ds_read_u16_d16 v1, v0 offset:2
616; GCN-NEXT:    s_waitcnt lgkmcnt(0)
617; GCN-NEXT:    v_mov_b32_e32 v0, v1
618; GCN-NEXT:    s_setpc_b64 s[30:31]
619;
620; GFX10-LABEL: chain_hi_to_lo_group_other_dep:
621; GFX10:       ; %bb.0: ; %bb
622; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
623; GFX10-NEXT:    ds_read_u16_d16_hi v1, v0
624; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX10-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
626; GFX10-NEXT:    ds_read_u16_d16 v1, v0 offset:2
627; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
628; GFX10-NEXT:    v_mov_b32_e32 v0, v1
629; GFX10-NEXT:    s_setpc_b64 s[30:31]
630;
631; GFX11-LABEL: chain_hi_to_lo_group_other_dep:
632; GFX11:       ; %bb.0: ; %bb
633; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634; GFX11-NEXT:    ds_load_u16_d16_hi v1, v0
635; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX11-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
637; GFX11-NEXT:    ds_load_u16_d16 v1, v0 offset:2
638; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
639; GFX11-NEXT:    v_mov_b32_e32 v0, v1
640; GFX11-NEXT:    s_setpc_b64 s[30:31]
641bb:
642  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
643  %load_lo = load i16, ptr addrspace(3) %gep_lo
644  %load_hi = load i16, ptr addrspace(3) %ptr
  ; Lane 1 is filled and modified before lane 0 is overwritten by %load_lo.
645  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
646  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
647  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
648  ret <2 x i16> %result
649}
650
651; The volatile operations aren't put on the same chain
; Same dataflow as the non-volatile addrspace(3) "other_dep" case (lane-1 load,
; vector add of <12, 12>, then lane-0 insert), except that both loads are
; volatile. The assertions below expect both loads to be issued up front (a
; plain 16-bit load and a d16_hi load into different registers) and the lanes
; to be merged with v_bfi_b32 after the packed add.
652define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %ptr) {
653; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
654; GFX900:       ; %bb.0: ; %bb
655; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656; GFX900-NEXT:    ds_read_u16 v1, v0 offset:2
657; GFX900-NEXT:    ds_read_u16_d16_hi v0, v0
658; GFX900-NEXT:    s_mov_b32 s4, 0xffff
659; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
660; GFX900-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
661; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
662; GFX900-NEXT:    s_setpc_b64 s[30:31]
663;
664; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
665; FLATSCR:       ; %bb.0: ; %bb
666; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667; FLATSCR-NEXT:    ds_read_u16 v1, v0 offset:2
668; FLATSCR-NEXT:    ds_read_u16_d16_hi v0, v0
669; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
670; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
671; FLATSCR-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
672; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v1, v0
673; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
674;
675; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
676; GFX10:       ; %bb.0: ; %bb
677; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
678; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
679; GFX10-NEXT:    ds_read_u16_d16_hi v0, v0
680; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX10-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
682; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
683; GFX10-NEXT:    s_setpc_b64 s[30:31]
684;
685; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
686; GFX11:       ; %bb.0: ; %bb
687; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688; GFX11-NEXT:    ds_load_u16 v1, v0 offset:2
689; GFX11-NEXT:    ds_load_u16_d16_hi v0, v0
690; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX11-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
692; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
693; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
694; GFX11-NEXT:    s_setpc_b64 s[30:31]
695bb:
696  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
  ; Both loads are volatile, which is the only IR difference from the
  ; non-volatile variant of this test.
697  %load_lo = load volatile i16, ptr addrspace(3) %gep_lo
698  %load_hi = load volatile i16, ptr addrspace(3) %ptr
699  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
700  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
701  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
702  ret <2 x i16> %result
703}
704
; Private (addrspace(5)) case where the high half has another user before the
; low half is inserted: %load_hi goes into element 1 of an undef <2 x i16>,
; <12, 12> is added to that vector, and only then is %load_lo written into
; element 0. Neither load is volatile.
; Every check block below expects the high half to be fetched with a d16_hi
; load, v_pk_add_u16 to run on it, and the low half to be fetched afterwards
; with a d16 load (offset:2) into the same VGPR, so no separate merge
; instruction appears; the result is then copied from v1 to v0.
705define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
706; GFX900-LABEL: chain_hi_to_lo_private_other_dep:
707; GFX900:       ; %bb.0: ; %bb
708; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
710; GFX900-NEXT:    s_waitcnt vmcnt(0)
711; GFX900-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
712; GFX900-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
713; GFX900-NEXT:    s_waitcnt vmcnt(0)
714; GFX900-NEXT:    v_mov_b32_e32 v0, v1
715; GFX900-NEXT:    s_setpc_b64 s[30:31]
716;
717; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
718; FLATSCR:       ; %bb.0: ; %bb
719; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
720; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
721; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
722; FLATSCR-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
723; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
724; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
725; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
726; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
727;
728; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
729; GFX10_DEFAULT:       ; %bb.0: ; %bb
730; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
731; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
732; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
733; GFX10_DEFAULT-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
734; GFX10_DEFAULT-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
735; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
736; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v0, v1
737; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
738;
739; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
740; FLATSCR_GFX10:       ; %bb.0: ; %bb
741; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
743; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
744; FLATSCR_GFX10-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
745; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
746; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
747; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
748; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
749;
750; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
751; GFX11:       ; %bb.0: ; %bb
752; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
753; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
754; GFX11-NEXT:    s_waitcnt vmcnt(0)
755; GFX11-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
756; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
757; GFX11-NEXT:    s_waitcnt vmcnt(0)
758; GFX11-NEXT:    v_mov_b32_e32 v0, v1
759; GFX11-NEXT:    s_setpc_b64 s[30:31]
760bb:
; One i16 element past %ptr, i.e. the offset:2 seen in the expected loads.
761  %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
762  %load_lo = load i16, ptr addrspace(5) %gep_lo
763  %load_hi = load i16, ptr addrspace(5) %ptr
764  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
; The add uses the vector before the low element is inserted (the "other dep").
765  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
766  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
767  ret <2 x i16> %result
768}
769
; Global (addrspace(1)) case with both loads marked volatile: %load_hi goes
; into element 1 of an undef <2 x i16>, <12, 12> is added, then %load_lo is
; inserted into element 0.
; The expected output keeps two separate loads, each followed by its own
; s_waitcnt vmcnt(0): a 16-bit load of the low half into v2 (offset:2) and a
; d16_hi load of the high half into v0. After v_pk_add_u16 the halves are
; combined by v_bfi_b32 with a 0xffff mask. The two gfx900 runs first move the
; mask into an SGPR with s_mov_b32; the GFX10 and GFX11 checks pass it as a
; literal operand.
770define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
771; GFX900-LABEL: chain_hi_to_lo_global_other_dep:
772; GFX900:       ; %bb.0: ; %bb
773; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
774; GFX900-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc
775; GFX900-NEXT:    s_waitcnt vmcnt(0)
776; GFX900-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc
777; GFX900-NEXT:    s_waitcnt vmcnt(0)
778; GFX900-NEXT:    s_mov_b32 s4, 0xffff
779; GFX900-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
780; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
781; GFX900-NEXT:    s_setpc_b64 s[30:31]
782;
783; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep:
784; FLATSCR:       ; %bb.0: ; %bb
785; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
786; FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc
787; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
788; FLATSCR-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc
789; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
790; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
791; FLATSCR-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
792; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v2, v0
793; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
794;
795; GFX10-LABEL: chain_hi_to_lo_global_other_dep:
796; GFX10:       ; %bb.0: ; %bb
797; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
798; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc dlc
799; GFX10-NEXT:    s_waitcnt vmcnt(0)
800; GFX10-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc dlc
801; GFX10-NEXT:    s_waitcnt vmcnt(0)
802; GFX10-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
803; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
804; GFX10-NEXT:    s_setpc_b64 s[30:31]
805;
806; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
807; GFX11:       ; %bb.0: ; %bb
808; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
809; GFX11-NEXT:    global_load_u16 v2, v[0:1], off offset:2 glc dlc
810; GFX11-NEXT:    s_waitcnt vmcnt(0)
811; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off glc dlc
812; GFX11-NEXT:    s_waitcnt vmcnt(0)
813; GFX11-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
814; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
815; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
816; GFX11-NEXT:    s_setpc_b64 s[30:31]
817bb:
; One i16 element past %ptr, i.e. the offset:2 seen in the expected loads.
818  %gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1
; Both loads are volatile; the low-half load is issued first in the IR and
; also appears first in every expected sequence.
819  %load_lo = load volatile i16, ptr addrspace(1) %gep_lo
820  %load_hi = load volatile i16, ptr addrspace(1) %ptr
821  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
822  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
823  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
824  ret <2 x i16> %result
825}
826
; Flat (addrspace(0)) case with both loads marked volatile: %load_hi goes into
; element 1 of an undef <2 x i16>, <12, 12> is added, then %load_lo is inserted
; into element 0.
; The expected output keeps two separate flat loads (low half into v2, high
; half via a d16_hi load into v0) and merges them with v_bfi_b32 and a 0xffff
; mask after v_pk_add_u16. Differences between the check blocks:
;  - the GFX10 expectations form the low-half address with v_add_co_u32 /
;    v_add_co_ci_u32_e32 (adding 2) instead of using an offset:2 immediate,
;    which the gfx900 and GFX11 expectations do use;
;  - all targets also wait on lgkmcnt(0) before the add, either as a separate
;    s_waitcnt (gfx900 runs) or folded into the vmcnt wait (GFX10, GFX11).
827define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
828; GFX900-LABEL: chain_hi_to_lo_flat_other_dep:
829; GFX900:       ; %bb.0: ; %bb
830; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
831; GFX900-NEXT:    flat_load_ushort v2, v[0:1] offset:2 glc
832; GFX900-NEXT:    s_waitcnt vmcnt(0)
833; GFX900-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc
834; GFX900-NEXT:    s_waitcnt vmcnt(0)
835; GFX900-NEXT:    s_mov_b32 s4, 0xffff
836; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
837; GFX900-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
838; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
839; GFX900-NEXT:    s_setpc_b64 s[30:31]
840;
841; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep:
842; FLATSCR:       ; %bb.0: ; %bb
843; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
844; FLATSCR-NEXT:    flat_load_ushort v2, v[0:1] offset:2 glc
845; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
846; FLATSCR-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc
847; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
848; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
849; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
850; FLATSCR-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
851; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v2, v0
852; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
853;
854; GFX10-LABEL: chain_hi_to_lo_flat_other_dep:
855; GFX10:       ; %bb.0: ; %bb
856; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 2
858; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
859; GFX10-NEXT:    flat_load_ushort v2, v[2:3] glc dlc
860; GFX10-NEXT:    s_waitcnt vmcnt(0)
861; GFX10-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc dlc
862; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
863; GFX10-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
864; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
865; GFX10-NEXT:    s_setpc_b64 s[30:31]
866;
867; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
868; GFX11:       ; %bb.0: ; %bb
869; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870; GFX11-NEXT:    flat_load_u16 v2, v[0:1] offset:2 glc dlc
871; GFX11-NEXT:    s_waitcnt vmcnt(0)
872; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[0:1] glc dlc
873; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
874; GFX11-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
875; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
876; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
877; GFX11-NEXT:    s_setpc_b64 s[30:31]
878bb:
; One i16 element (2 bytes) past %ptr.
879  %gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
; Both loads are volatile; the low-half load is issued first in the IR and
; also appears first in every expected sequence.
880  %load_lo = load volatile i16, ptr addrspace(0) %gep_lo
881  %load_hi = load volatile i16, ptr addrspace(0) %ptr
882  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
883  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
884  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
885  ret <2 x i16> %result
886}
887
; LDS (addrspace(3)) case with a store between the two loads: the high half is
; loaded from %ptr, then i16 123 is stored through %may.alias (a second pointer
; argument in the same address space), then the low half is loaded from
; %ptr + 1 element. There is no add here; the two halves are only packed into
; a <2 x i16>.
; Every check block expects the original memory order to be kept (16-bit read
; of the high half, 16-bit write of 0x7b, 16-bit read at offset:2) with plain
; u16 reads rather than a d16_hi read, and the halves to be packed afterwards
; by v_perm_b32 with selector 0x5040100. The two gfx900 runs first move the
; selector into an SGPR; the GFX10 and GFX11 checks pass it as a literal.
888define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, ptr addrspace(3) %may.alias) {
889; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store:
890; GFX900:       ; %bb.0: ; %bb
891; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
892; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7b
893; GFX900-NEXT:    ds_read_u16 v2, v0
894; GFX900-NEXT:    ds_write_b16 v1, v3
895; GFX900-NEXT:    ds_read_u16 v0, v0 offset:2
896; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
897; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
899; GFX900-NEXT:    s_setpc_b64 s[30:31]
900;
901; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store:
902; FLATSCR:       ; %bb.0: ; %bb
903; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
904; FLATSCR-NEXT:    v_mov_b32_e32 v3, 0x7b
905; FLATSCR-NEXT:    ds_read_u16 v2, v0
906; FLATSCR-NEXT:    ds_write_b16 v1, v3
907; FLATSCR-NEXT:    ds_read_u16 v0, v0 offset:2
908; FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
909; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
910; FLATSCR-NEXT:    v_perm_b32 v0, v2, v0, s0
911; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
912;
913; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store:
914; GFX10:       ; %bb.0: ; %bb
915; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
916; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
917; GFX10-NEXT:    ds_read_u16 v3, v0
918; GFX10-NEXT:    ds_write_b16 v1, v2
919; GFX10-NEXT:    ds_read_u16 v0, v0 offset:2
920; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
922; GFX10-NEXT:    s_setpc_b64 s[30:31]
923;
924; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
925; GFX11:       ; %bb.0: ; %bb
926; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927; GFX11-NEXT:    v_mov_b32_e32 v2, 0x7b
928; GFX11-NEXT:    ds_load_u16 v3, v0
929; GFX11-NEXT:    ds_store_b16 v1, v2
930; GFX11-NEXT:    ds_load_u16 v0, v0 offset:2
931; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
933; GFX11-NEXT:    s_setpc_b64 s[30:31]
934bb:
; One i16 element past %ptr, i.e. the offset:2 seen in the expected reads.
935  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
; Order matters: high-half load, then the store, then the low-half load.
936  %load_hi = load i16, ptr addrspace(3) %ptr
937  store i16 123, ptr addrspace(3) %may.alias
938  %load_lo = load i16, ptr addrspace(3) %gep_lo
939
940  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
941  %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
942  ret <2 x i16> %result
943}
944