xref: /llvm-project/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (revision 5da7179cb3ff80203f58ddea71562816b2ae4ff6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
5
; vec_8xi16_extract_4xi16: loads a volatile <8 x i16> from %p0 (block %T) or
; from %p1 (block %F), merges the two values with a phi, and shuffles out
; lanes <0, 1, 2, 2>. Each selected lane becomes -32768 when it is greater
; than -1 (signed compare) and -1 otherwise; the <4 x i16> result is returned.
; The assertion lines below are autogenerated (see the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
6define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
7; SI-LABEL: vec_8xi16_extract_4xi16:
8; SI:       ; %bb.0:
9; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; SI-NEXT:    s_cbranch_scc0 .LBB0_2
11; SI-NEXT:  ; %bb.1: ; %F
12; SI-NEXT:    s_mov_b32 s6, 0
13; SI-NEXT:    s_mov_b32 s7, 0xf000
14; SI-NEXT:    s_mov_b32 s4, s6
15; SI-NEXT:    s_mov_b32 s5, s6
16; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
17; SI-NEXT:    s_waitcnt vmcnt(0)
18; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
19; SI-NEXT:    s_waitcnt vmcnt(0)
20; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
21; SI-NEXT:    s_waitcnt vmcnt(0)
22; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
23; SI-NEXT:    s_waitcnt vmcnt(0)
24; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
27; SI-NEXT:    s_waitcnt vmcnt(0)
28; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
29; SI-NEXT:    s_waitcnt vmcnt(0)
30; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
31; SI-NEXT:    s_waitcnt vmcnt(0)
32; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
33; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
34; SI-NEXT:    v_or_b32_e32 v3, v6, v2
35; SI-NEXT:    v_or_b32_e32 v2, v4, v5
36; SI-NEXT:    s_mov_b64 vcc, exec
37; SI-NEXT:    s_cbranch_execz .LBB0_3
38; SI-NEXT:    s_branch .LBB0_4
39; SI-NEXT:  .LBB0_2:
40; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
41; SI-NEXT:    s_mov_b64 vcc, 0
42; SI-NEXT:  .LBB0_3: ; %T
43; SI-NEXT:    s_mov_b32 s6, 0
44; SI-NEXT:    s_mov_b32 s7, 0xf000
45; SI-NEXT:    s_mov_b32 s4, s6
46; SI-NEXT:    s_mov_b32 s5, s6
47; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
48; SI-NEXT:    s_waitcnt vmcnt(0)
49; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
50; SI-NEXT:    s_waitcnt vmcnt(0)
51; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
52; SI-NEXT:    s_waitcnt vmcnt(0)
53; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
54; SI-NEXT:    s_waitcnt vmcnt(0)
55; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
56; SI-NEXT:    s_waitcnt vmcnt(0)
57; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
58; SI-NEXT:    s_waitcnt vmcnt(0)
59; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
60; SI-NEXT:    s_waitcnt vmcnt(0)
61; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
62; SI-NEXT:    s_waitcnt vmcnt(0)
63; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
64; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
65; SI-NEXT:    v_or_b32_e32 v3, v4, v0
66; SI-NEXT:    v_or_b32_e32 v2, v2, v1
67; SI-NEXT:  .LBB0_4: ; %exit
68; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
69; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
70; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
71; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
72; SI-NEXT:    v_bfrev_b32_e32 v4, 1
73; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
74; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
75; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
76; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
77; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
78; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
79; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
80; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
81; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
82; SI-NEXT:    v_or_b32_e32 v0, v0, v4
83; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
84; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
85; SI-NEXT:    v_or_b32_e32 v2, v3, v2
86; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
87; SI-NEXT:    s_setpc_b64 s[30:31]
88;
89; GFX9-LABEL: vec_8xi16_extract_4xi16:
90; GFX9:       ; %bb.0:
91; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
93; GFX9-NEXT:  ; %bb.1: ; %F
94; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
95; GFX9-NEXT:    s_waitcnt vmcnt(0)
96; GFX9-NEXT:    s_cbranch_execz .LBB0_3
97; GFX9-NEXT:    s_branch .LBB0_4
98; GFX9-NEXT:  .LBB0_2:
99; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
100; GFX9-NEXT:  .LBB0_3: ; %T
101; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
102; GFX9-NEXT:    s_waitcnt vmcnt(0)
103; GFX9-NEXT:  .LBB0_4: ; %exit
104; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
105; GFX9-NEXT:    s_movk_i32 s4, 0x8000
106; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
107; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
108; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
109; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
110; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
111; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
112; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
113; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
114; GFX9-NEXT:    s_setpc_b64 s[30:31]
115;
116; GFX11-LABEL: vec_8xi16_extract_4xi16:
117; GFX11:       ; %bb.0:
118; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119; GFX11-NEXT:    s_cbranch_scc0 .LBB0_2
120; GFX11-NEXT:  ; %bb.1: ; %F
121; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
122; GFX11-NEXT:    s_waitcnt vmcnt(0)
123; GFX11-NEXT:    s_cbranch_execz .LBB0_3
124; GFX11-NEXT:    s_branch .LBB0_4
125; GFX11-NEXT:  .LBB0_2:
126; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
127; GFX11-NEXT:  .LBB0_3: ; %T
128; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
129; GFX11-NEXT:    s_waitcnt vmcnt(0)
130; GFX11-NEXT:  .LBB0_4: ; %exit
131; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
132; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
133; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
134; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
135; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
136; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
137; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
138; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
139; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
140; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
141; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
142; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
143; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
144; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Branch on an undef condition; the checked output above contains code for
  ; both the %T and the %F successor.
  ; Review note: confirm whether branching on undef is still intended here.
145  br i1 undef, label %T, label %F
146
147T:
  ; Volatile load of the whole 8-lane vector through %p0.
148  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
149  br label %exit
150
151F:
  ; Volatile load of the whole 8-lane vector through %p1.
152  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
153  br label %exit
154
155exit:
  ; %m is %t when control arrives from %T and %f when it arrives from %F.
156  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep lanes 0, 1 and 2 of %m and repeat lane 2 in the last position.
157  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per-lane signed test "lane > -1".
158  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; Lanes that passed the test become -32768; the others become -1.
159  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
160  ret <4 x i16> %r2
161}
162
; vec_8xi16_extract_4xi16_2: same shape as a plain <8 x i16> subvector
; extract test, but the shuffle selects lanes <4, 5, 6, 7> of the merged
; vector. The vector is loaded (volatile) from %p0 in %T or %p1 in %F; each
; selected lane becomes -32768 when it is greater than -1 (signed compare) and
; -1 otherwise.
; The assertion lines below are autogenerated (see the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
163define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
164; SI-LABEL: vec_8xi16_extract_4xi16_2:
165; SI:       ; %bb.0:
166; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; SI-NEXT:    s_cbranch_scc0 .LBB1_2
168; SI-NEXT:  ; %bb.1: ; %F
169; SI-NEXT:    s_mov_b32 s6, 0
170; SI-NEXT:    s_mov_b32 s7, 0xf000
171; SI-NEXT:    s_mov_b32 s4, s6
172; SI-NEXT:    s_mov_b32 s5, s6
173; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
174; SI-NEXT:    s_waitcnt vmcnt(0)
175; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
176; SI-NEXT:    s_waitcnt vmcnt(0)
177; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
178; SI-NEXT:    s_waitcnt vmcnt(0)
179; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
180; SI-NEXT:    s_waitcnt vmcnt(0)
181; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
182; SI-NEXT:    s_waitcnt vmcnt(0)
183; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
184; SI-NEXT:    s_waitcnt vmcnt(0)
185; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
186; SI-NEXT:    s_waitcnt vmcnt(0)
187; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
188; SI-NEXT:    s_waitcnt vmcnt(0)
189; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
190; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
191; SI-NEXT:    v_or_b32_e32 v5, v6, v2
192; SI-NEXT:    v_or_b32_e32 v4, v4, v3
193; SI-NEXT:    s_mov_b64 vcc, exec
194; SI-NEXT:    s_cbranch_execz .LBB1_3
195; SI-NEXT:    s_branch .LBB1_4
196; SI-NEXT:  .LBB1_2:
197; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
198; SI-NEXT:    s_mov_b64 vcc, 0
199; SI-NEXT:  .LBB1_3: ; %T
200; SI-NEXT:    s_mov_b32 s6, 0
201; SI-NEXT:    s_mov_b32 s7, 0xf000
202; SI-NEXT:    s_mov_b32 s4, s6
203; SI-NEXT:    s_mov_b32 s5, s6
204; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
205; SI-NEXT:    s_waitcnt vmcnt(0)
206; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
207; SI-NEXT:    s_waitcnt vmcnt(0)
208; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
209; SI-NEXT:    s_waitcnt vmcnt(0)
210; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
211; SI-NEXT:    s_waitcnt vmcnt(0)
212; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
213; SI-NEXT:    s_waitcnt vmcnt(0)
214; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
215; SI-NEXT:    s_waitcnt vmcnt(0)
216; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
217; SI-NEXT:    s_waitcnt vmcnt(0)
218; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
219; SI-NEXT:    s_waitcnt vmcnt(0)
220; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
221; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
222; SI-NEXT:    v_or_b32_e32 v5, v4, v0
223; SI-NEXT:    v_or_b32_e32 v4, v2, v1
224; SI-NEXT:  .LBB1_4: ; %exit
225; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
226; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
227; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
228; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
229; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
230; SI-NEXT:    v_bfrev_b32_e32 v5, 1
231; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
232; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
233; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
234; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
235; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
236; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
237; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
238; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
239; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
240; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
241; SI-NEXT:    v_or_b32_e32 v0, v1, v8
242; SI-NEXT:    v_or_b32_e32 v2, v2, v3
243; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
244; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
245; SI-NEXT:    s_setpc_b64 s[30:31]
246;
247; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
248; GFX9:       ; %bb.0:
249; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
251; GFX9-NEXT:  ; %bb.1: ; %F
252; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
253; GFX9-NEXT:    s_waitcnt vmcnt(0)
254; GFX9-NEXT:    s_cbranch_execz .LBB1_3
255; GFX9-NEXT:    s_branch .LBB1_4
256; GFX9-NEXT:  .LBB1_2:
257; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
258; GFX9-NEXT:  .LBB1_3: ; %T
259; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
260; GFX9-NEXT:    s_waitcnt vmcnt(0)
261; GFX9-NEXT:  .LBB1_4: ; %exit
262; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
263; GFX9-NEXT:    s_movk_i32 s4, 0x8000
264; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
265; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
266; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
267; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
268; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
269; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
270; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
271; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
272; GFX9-NEXT:    s_setpc_b64 s[30:31]
273;
274; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
275; GFX11:       ; %bb.0:
276; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
278; GFX11-NEXT:  ; %bb.1: ; %F
279; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
280; GFX11-NEXT:    s_waitcnt vmcnt(0)
281; GFX11-NEXT:    s_cbranch_execz .LBB1_3
282; GFX11-NEXT:    s_branch .LBB1_4
283; GFX11-NEXT:  .LBB1_2:
284; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
285; GFX11-NEXT:  .LBB1_3: ; %T
286; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
287; GFX11-NEXT:    s_waitcnt vmcnt(0)
288; GFX11-NEXT:  .LBB1_4: ; %exit
289; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
290; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
291; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
292; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
293; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
294; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
295; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
296; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
297; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
298; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
299; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
300; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
301; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
302; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Branch on an undef condition; the checked output above contains code for
  ; both the %T and the %F successor.
  ; Review note: confirm whether branching on undef is still intended here.
303  br i1 undef, label %T, label %F
304
305T:
  ; Volatile load of the whole 8-lane vector through %p0.
306  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
307  br label %exit
308
309F:
  ; Volatile load of the whole 8-lane vector through %p1.
310  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
311  br label %exit
312
313exit:
  ; %m is %t when control arrives from %T and %f when it arrives from %F.
314  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep lanes 4 through 7 of %m, in order.
315  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Per-lane signed test "lane > -1".
316  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; Lanes that passed the test become -32768; the others become -1.
317  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
318  ret <4 x i16> %r2
319}
320
; vec_8xf16_extract_4xf16: half-precision variant. Loads a volatile
; <8 x half> from %p0 (block %T) or %p1 (block %F), merges the two with a phi,
; and shuffles out lanes <0, 1, 2, 2>. Each selected lane is compared with
; `fcmp ugt` against 0xH3800 (shown as 0.5 in the compare operands of the
; checks below); the result lane is 0xH3900 when the compare is true and
; 0xH3D00 otherwise.
; The assertion lines below are autogenerated (see the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
321define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
322; SI-LABEL: vec_8xf16_extract_4xf16:
323; SI:       ; %bb.0:
324; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; SI-NEXT:    s_cbranch_scc0 .LBB2_2
326; SI-NEXT:  ; %bb.1: ; %F
327; SI-NEXT:    s_mov_b32 s6, 0
328; SI-NEXT:    s_mov_b32 s7, 0xf000
329; SI-NEXT:    s_mov_b32 s4, s6
330; SI-NEXT:    s_mov_b32 s5, s6
331; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
332; SI-NEXT:    s_waitcnt vmcnt(0)
333; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
334; SI-NEXT:    s_waitcnt vmcnt(0)
335; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
336; SI-NEXT:    s_waitcnt vmcnt(0)
337; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
338; SI-NEXT:    s_waitcnt vmcnt(0)
339; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
340; SI-NEXT:    s_waitcnt vmcnt(0)
341; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
342; SI-NEXT:    s_waitcnt vmcnt(0)
343; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
344; SI-NEXT:    s_waitcnt vmcnt(0)
345; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
346; SI-NEXT:    s_waitcnt vmcnt(0)
347; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
348; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
349; SI-NEXT:    v_cvt_f32_f16_e32 v3, v5
350; SI-NEXT:    v_or_b32_e32 v2, v6, v2
351; SI-NEXT:    v_or_b32_e32 v4, v4, v7
352; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
353; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
354; SI-NEXT:    s_mov_b64 vcc, exec
355; SI-NEXT:    s_cbranch_execz .LBB2_3
356; SI-NEXT:    s_branch .LBB2_4
357; SI-NEXT:  .LBB2_2:
358; SI-NEXT:    ; implicit-def: $vgpr4
359; SI-NEXT:    ; implicit-def: $vgpr3
360; SI-NEXT:    ; implicit-def: $vgpr2
361; SI-NEXT:    s_mov_b64 vcc, 0
362; SI-NEXT:  .LBB2_3: ; %T
363; SI-NEXT:    s_mov_b32 s6, 0
364; SI-NEXT:    s_mov_b32 s7, 0xf000
365; SI-NEXT:    s_mov_b32 s4, s6
366; SI-NEXT:    s_mov_b32 s5, s6
367; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
368; SI-NEXT:    s_waitcnt vmcnt(0)
369; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
370; SI-NEXT:    s_waitcnt vmcnt(0)
371; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
372; SI-NEXT:    s_waitcnt vmcnt(0)
373; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
374; SI-NEXT:    s_waitcnt vmcnt(0)
375; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
376; SI-NEXT:    s_waitcnt vmcnt(0)
377; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
378; SI-NEXT:    s_waitcnt vmcnt(0)
379; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
380; SI-NEXT:    s_waitcnt vmcnt(0)
381; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
382; SI-NEXT:    s_waitcnt vmcnt(0)
383; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
384; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
385; SI-NEXT:    v_or_b32_e32 v0, v4, v0
386; SI-NEXT:    v_or_b32_e32 v1, v2, v1
387; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
388; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
389; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
390; SI-NEXT:  .LBB2_4: ; %exit
391; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
392; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
393; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
394; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
395; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
396; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
397; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
398; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
399; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
400; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
401; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
402; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
403; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
404; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
405; SI-NEXT:    v_mov_b32_e32 v3, v2
406; SI-NEXT:    s_setpc_b64 s[30:31]
407;
408; GFX9-LABEL: vec_8xf16_extract_4xf16:
409; GFX9:       ; %bb.0:
410; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
411; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
412; GFX9-NEXT:  ; %bb.1: ; %F
413; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
414; GFX9-NEXT:    s_waitcnt vmcnt(0)
415; GFX9-NEXT:    s_cbranch_execz .LBB2_3
416; GFX9-NEXT:    s_branch .LBB2_4
417; GFX9-NEXT:  .LBB2_2:
418; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
419; GFX9-NEXT:  .LBB2_3: ; %T
420; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
421; GFX9-NEXT:    s_waitcnt vmcnt(0)
422; GFX9-NEXT:  .LBB2_4: ; %exit
423; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
424; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
425; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
426; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3800
427; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
428; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
429; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
430; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, 0.5, v3
431; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc
432; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v3
433; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
434; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v5
435; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v2
436; GFX9-NEXT:    s_setpc_b64 s[30:31]
437;
438; GFX11-LABEL: vec_8xf16_extract_4xf16:
439; GFX11:       ; %bb.0:
440; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441; GFX11-NEXT:    s_cbranch_scc0 .LBB2_2
442; GFX11-NEXT:  ; %bb.1: ; %F
443; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
444; GFX11-NEXT:    s_waitcnt vmcnt(0)
445; GFX11-NEXT:    s_cbranch_execz .LBB2_3
446; GFX11-NEXT:    s_branch .LBB2_4
447; GFX11-NEXT:  .LBB2_2:
448; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
449; GFX11-NEXT:  .LBB2_3: ; %T
450; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
451; GFX11-NEXT:    s_waitcnt vmcnt(0)
452; GFX11-NEXT:  .LBB2_4: ; %exit
453; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
454; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
455; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
456; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
457; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
458; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
459; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
460; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
461; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
462; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
463; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
464; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
465; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
466; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
467; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Branch on an undef condition; the checked output above contains code for
  ; both the %T and the %F successor.
  ; Review note: confirm whether branching on undef is still intended here.
468  br i1 undef, label %T, label %F
469
470T:
  ; Volatile load of the whole 8-lane half vector through %p0.
471  %t = load volatile <8 x half>, ptr addrspace(1) %p0
472  br label %exit
473
474F:
  ; Volatile load of the whole 8-lane half vector through %p1.
475  %f = load volatile <8 x half>, ptr addrspace(1) %p1
476  br label %exit
477
478exit:
  ; %m is %t when control arrives from %T and %f when it arrives from %F.
479  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  ; Keep lanes 0, 1 and 2 of %m and repeat lane 2 in the last position.
480  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per-lane "unordered or greater than" compare against 0xH3800.
481  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  ; Lanes that passed the compare become 0xH3900; the others become 0xH3D00.
482  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
483  ret <4 x half> %r2
484}
485
; vec_16xi16_extract_4xi16: 16-lane variant. Loads a volatile <16 x i16> from
; %p0 (block %T) or %p1 (block %F), merges the two with a phi, and shuffles
; out lanes <0, 1, 2, 2>. Each selected lane becomes -32768 when it is greater
; than -1 (signed compare) and -1 otherwise; the <4 x i16> result is returned.
; The assertion lines below are autogenerated (see the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
486define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
487;
488; SI-LABEL: vec_16xi16_extract_4xi16:
489; SI:       ; %bb.0:
490; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491; SI-NEXT:    s_cbranch_scc0 .LBB3_2
492; SI-NEXT:  ; %bb.1: ; %F
493; SI-NEXT:    s_mov_b32 s6, 0
494; SI-NEXT:    s_mov_b32 s7, 0xf000
495; SI-NEXT:    s_mov_b32 s4, s6
496; SI-NEXT:    s_mov_b32 s5, s6
497; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
498; SI-NEXT:    s_waitcnt vmcnt(0)
499; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
500; SI-NEXT:    s_waitcnt vmcnt(0)
501; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
502; SI-NEXT:    s_waitcnt vmcnt(0)
503; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
504; SI-NEXT:    s_waitcnt vmcnt(0)
505; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
506; SI-NEXT:    s_waitcnt vmcnt(0)
507; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
508; SI-NEXT:    s_waitcnt vmcnt(0)
509; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
510; SI-NEXT:    s_waitcnt vmcnt(0)
511; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
512; SI-NEXT:    s_waitcnt vmcnt(0)
513; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
514; SI-NEXT:    s_waitcnt vmcnt(0)
515; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
516; SI-NEXT:    s_waitcnt vmcnt(0)
517; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
518; SI-NEXT:    s_waitcnt vmcnt(0)
519; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
520; SI-NEXT:    s_waitcnt vmcnt(0)
521; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
522; SI-NEXT:    s_waitcnt vmcnt(0)
523; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
524; SI-NEXT:    s_waitcnt vmcnt(0)
525; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
526; SI-NEXT:    s_waitcnt vmcnt(0)
527; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
528; SI-NEXT:    s_waitcnt vmcnt(0)
529; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
530; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
531; SI-NEXT:    v_or_b32_e32 v3, v6, v2
532; SI-NEXT:    v_or_b32_e32 v2, v4, v5
533; SI-NEXT:    s_mov_b64 vcc, exec
534; SI-NEXT:    s_cbranch_execz .LBB3_3
535; SI-NEXT:    s_branch .LBB3_4
536; SI-NEXT:  .LBB3_2:
537; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
538; SI-NEXT:    s_mov_b64 vcc, 0
539; SI-NEXT:  .LBB3_3: ; %T
540; SI-NEXT:    s_mov_b32 s6, 0
541; SI-NEXT:    s_mov_b32 s7, 0xf000
542; SI-NEXT:    s_mov_b32 s4, s6
543; SI-NEXT:    s_mov_b32 s5, s6
544; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
545; SI-NEXT:    s_waitcnt vmcnt(0)
546; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
547; SI-NEXT:    s_waitcnt vmcnt(0)
548; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
549; SI-NEXT:    s_waitcnt vmcnt(0)
550; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
551; SI-NEXT:    s_waitcnt vmcnt(0)
552; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
553; SI-NEXT:    s_waitcnt vmcnt(0)
554; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
555; SI-NEXT:    s_waitcnt vmcnt(0)
556; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
557; SI-NEXT:    s_waitcnt vmcnt(0)
558; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
559; SI-NEXT:    s_waitcnt vmcnt(0)
560; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
561; SI-NEXT:    s_waitcnt vmcnt(0)
562; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
563; SI-NEXT:    s_waitcnt vmcnt(0)
564; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
565; SI-NEXT:    s_waitcnt vmcnt(0)
566; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
567; SI-NEXT:    s_waitcnt vmcnt(0)
568; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
569; SI-NEXT:    s_waitcnt vmcnt(0)
570; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
571; SI-NEXT:    s_waitcnt vmcnt(0)
572; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
573; SI-NEXT:    s_waitcnt vmcnt(0)
574; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
575; SI-NEXT:    s_waitcnt vmcnt(0)
576; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
577; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
578; SI-NEXT:    v_or_b32_e32 v3, v4, v0
579; SI-NEXT:    v_or_b32_e32 v2, v2, v1
580; SI-NEXT:  .LBB3_4: ; %exit
581; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
582; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
583; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
584; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
585; SI-NEXT:    v_bfrev_b32_e32 v4, 1
586; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
587; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
588; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
589; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
590; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
591; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
592; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
593; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
594; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
595; SI-NEXT:    v_or_b32_e32 v0, v0, v4
596; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
597; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
598; SI-NEXT:    v_or_b32_e32 v2, v3, v2
599; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
600; SI-NEXT:    s_setpc_b64 s[30:31]
601;
602; GFX9-LABEL: vec_16xi16_extract_4xi16:
603; GFX9:       ; %bb.0:
604; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; GFX9-NEXT:    s_cbranch_scc0 .LBB3_2
606; GFX9-NEXT:  ; %bb.1: ; %F
607; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
608; GFX9-NEXT:    s_waitcnt vmcnt(0)
609; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
610; GFX9-NEXT:    s_waitcnt vmcnt(0)
611; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
612; GFX9-NEXT:    s_cbranch_execz .LBB3_3
613; GFX9-NEXT:    s_branch .LBB3_4
614; GFX9-NEXT:  .LBB3_2:
615; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
616; GFX9-NEXT:  .LBB3_3: ; %T
617; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
618; GFX9-NEXT:    s_waitcnt vmcnt(0)
619; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
620; GFX9-NEXT:    s_waitcnt vmcnt(0)
621; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
622; GFX9-NEXT:  .LBB3_4: ; %exit
623; GFX9-NEXT:    s_waitcnt vmcnt(0)
624; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
625; GFX9-NEXT:    s_movk_i32 s4, 0x8000
626; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
627; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
628; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
629; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
630; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
631; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
632; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
633; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
634; GFX9-NEXT:    s_setpc_b64 s[30:31]
635;
636; GFX11-LABEL: vec_16xi16_extract_4xi16:
637; GFX11:       ; %bb.0:
638; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639; GFX11-NEXT:    s_cbranch_scc0 .LBB3_2
640; GFX11-NEXT:  ; %bb.1: ; %F
641; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
642; GFX11-NEXT:    s_waitcnt vmcnt(0)
643; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
644; GFX11-NEXT:    s_waitcnt vmcnt(0)
645; GFX11-NEXT:    s_cbranch_execz .LBB3_3
646; GFX11-NEXT:    s_branch .LBB3_4
647; GFX11-NEXT:  .LBB3_2:
648; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
649; GFX11-NEXT:  .LBB3_3: ; %T
650; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
651; GFX11-NEXT:    s_waitcnt vmcnt(0)
652; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
653; GFX11-NEXT:    s_waitcnt vmcnt(0)
654; GFX11-NEXT:  .LBB3_4: ; %exit
655; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
656; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
657; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
658; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
659; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
660; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
661; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
662; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
663; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
664; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
665; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
666; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
667; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
668; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; Branch on an undef condition; the checked output above contains code for
  ; both the %T and the %F successor.
  ; Review note: confirm whether branching on undef is still intended here.
669  br i1 undef, label %T, label %F
670
671T:
  ; Volatile load of the whole 16-lane vector through %p0.
672  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
673  br label %exit
674
675F:
  ; Volatile load of the whole 16-lane vector through %p1.
676  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
677  br label %exit
678
679exit:
  ; %m is %t when control arrives from %T and %f when it arrives from %F.
680  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep lanes 0, 1 and 2 of %m and repeat lane 2 in the last position.
681  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per-lane signed test "lane > -1".
682  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; Lanes that passed the test become -32768; the others become -1.
683  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
684  ret <4 x i16> %r2
685}
686
; Test: extract the second group of four i16 elements (indices 4..7) from a
; <16 x i16> value that reaches %exit through a phi of two volatile loads.
;   %p0 - global (addrspace 1) pointer loaded in block %T
;   %p1 - global (addrspace 1) pointer loaded in block %F
; Returns a <4 x i16> where each lane is -32768 if the extracted element is
; greater than -1 (signed compare) and -1 otherwise.
; The assertions below are autogenerated (see the header note of this file);
; regenerate them with the update script instead of editing them by hand.
687define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
688;
689; SI-LABEL: vec_16xi16_extract_4xi16_2:
690; SI:       ; %bb.0:
691; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
692; SI-NEXT:    s_cbranch_scc0 .LBB4_2
693; SI-NEXT:  ; %bb.1: ; %F
694; SI-NEXT:    s_mov_b32 s6, 0
695; SI-NEXT:    s_mov_b32 s7, 0xf000
696; SI-NEXT:    s_mov_b32 s4, s6
697; SI-NEXT:    s_mov_b32 s5, s6
698; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
699; SI-NEXT:    s_waitcnt vmcnt(0)
700; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
701; SI-NEXT:    s_waitcnt vmcnt(0)
702; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
703; SI-NEXT:    s_waitcnt vmcnt(0)
704; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
705; SI-NEXT:    s_waitcnt vmcnt(0)
706; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
707; SI-NEXT:    s_waitcnt vmcnt(0)
708; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
709; SI-NEXT:    s_waitcnt vmcnt(0)
710; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
711; SI-NEXT:    s_waitcnt vmcnt(0)
712; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
713; SI-NEXT:    s_waitcnt vmcnt(0)
714; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
715; SI-NEXT:    s_waitcnt vmcnt(0)
716; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
717; SI-NEXT:    s_waitcnt vmcnt(0)
718; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
719; SI-NEXT:    s_waitcnt vmcnt(0)
720; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
721; SI-NEXT:    s_waitcnt vmcnt(0)
722; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
723; SI-NEXT:    s_waitcnt vmcnt(0)
724; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
725; SI-NEXT:    s_waitcnt vmcnt(0)
726; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
727; SI-NEXT:    s_waitcnt vmcnt(0)
728; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
729; SI-NEXT:    s_waitcnt vmcnt(0)
730; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
731; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
732; SI-NEXT:    v_or_b32_e32 v5, v6, v2
733; SI-NEXT:    v_or_b32_e32 v4, v4, v3
734; SI-NEXT:    s_mov_b64 vcc, exec
735; SI-NEXT:    s_cbranch_execz .LBB4_3
736; SI-NEXT:    s_branch .LBB4_4
737; SI-NEXT:  .LBB4_2:
738; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
739; SI-NEXT:    s_mov_b64 vcc, 0
740; SI-NEXT:  .LBB4_3: ; %T
741; SI-NEXT:    s_mov_b32 s6, 0
742; SI-NEXT:    s_mov_b32 s7, 0xf000
743; SI-NEXT:    s_mov_b32 s4, s6
744; SI-NEXT:    s_mov_b32 s5, s6
745; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
746; SI-NEXT:    s_waitcnt vmcnt(0)
747; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
748; SI-NEXT:    s_waitcnt vmcnt(0)
749; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
750; SI-NEXT:    s_waitcnt vmcnt(0)
751; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
752; SI-NEXT:    s_waitcnt vmcnt(0)
753; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
754; SI-NEXT:    s_waitcnt vmcnt(0)
755; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
756; SI-NEXT:    s_waitcnt vmcnt(0)
757; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
758; SI-NEXT:    s_waitcnt vmcnt(0)
759; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
760; SI-NEXT:    s_waitcnt vmcnt(0)
761; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
762; SI-NEXT:    s_waitcnt vmcnt(0)
763; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
764; SI-NEXT:    s_waitcnt vmcnt(0)
765; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
766; SI-NEXT:    s_waitcnt vmcnt(0)
767; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
768; SI-NEXT:    s_waitcnt vmcnt(0)
769; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
770; SI-NEXT:    s_waitcnt vmcnt(0)
771; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
772; SI-NEXT:    s_waitcnt vmcnt(0)
773; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
774; SI-NEXT:    s_waitcnt vmcnt(0)
775; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
776; SI-NEXT:    s_waitcnt vmcnt(0)
777; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
778; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
779; SI-NEXT:    v_or_b32_e32 v5, v4, v0
780; SI-NEXT:    v_or_b32_e32 v4, v2, v1
781; SI-NEXT:  .LBB4_4: ; %exit
782; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
783; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
784; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
785; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
786; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
787; SI-NEXT:    v_bfrev_b32_e32 v5, 1
788; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
789; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
790; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
791; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
792; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
793; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
794; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
795; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
796; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
797; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
798; SI-NEXT:    v_or_b32_e32 v0, v1, v8
799; SI-NEXT:    v_or_b32_e32 v2, v2, v3
800; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
801; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
802; SI-NEXT:    s_setpc_b64 s[30:31]
803;
804; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
805; GFX9:       ; %bb.0:
806; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807; GFX9-NEXT:    s_cbranch_scc0 .LBB4_2
808; GFX9-NEXT:  ; %bb.1: ; %F
809; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
810; GFX9-NEXT:    s_waitcnt vmcnt(0)
811; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
812; GFX9-NEXT:    s_waitcnt vmcnt(0)
813; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
814; GFX9-NEXT:    s_cbranch_execz .LBB4_3
815; GFX9-NEXT:    s_branch .LBB4_4
816; GFX9-NEXT:  .LBB4_2:
817; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
818; GFX9-NEXT:  .LBB4_3: ; %T
819; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
820; GFX9-NEXT:    s_waitcnt vmcnt(0)
821; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
822; GFX9-NEXT:    s_waitcnt vmcnt(0)
823; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
824; GFX9-NEXT:  .LBB4_4: ; %exit
825; GFX9-NEXT:    s_waitcnt vmcnt(0)
826; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
827; GFX9-NEXT:    s_movk_i32 s4, 0x8000
828; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
829; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
830; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1]
831; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
832; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
833; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
834; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
835; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
836; GFX9-NEXT:    s_setpc_b64 s[30:31]
837;
838; GFX11-LABEL: vec_16xi16_extract_4xi16_2:
839; GFX11:       ; %bb.0:
840; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841; GFX11-NEXT:    s_cbranch_scc0 .LBB4_2
842; GFX11-NEXT:  ; %bb.1: ; %F
843; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
844; GFX11-NEXT:    s_waitcnt vmcnt(0)
845; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
846; GFX11-NEXT:    s_waitcnt vmcnt(0)
847; GFX11-NEXT:    s_cbranch_execz .LBB4_3
848; GFX11-NEXT:    s_branch .LBB4_4
849; GFX11-NEXT:  .LBB4_2:
850; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
851; GFX11-NEXT:  .LBB4_3: ; %T
852; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
853; GFX11-NEXT:    s_waitcnt vmcnt(0)
854; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
855; GFX11-NEXT:    s_waitcnt vmcnt(0)
856; GFX11-NEXT:  .LBB4_4: ; %exit
857; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
858; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
859; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
860; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
861; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
862; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
863; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
864; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
865; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
866; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
867; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
868; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
869; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
870; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; The branch condition is undef, so the taken successor is unspecified; the
  ; expected output above contains code for both the %T and the %F path.
871  br i1 undef, label %T, label %F
872
873T:
  ; Volatile load of the full 16-element vector from %p0.
874  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
875  br label %exit
876
877F:
  ; Volatile load of the full 16-element vector from %p1.
878  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
879  br label %exit
880
881exit:
  ; Merge the two loaded vectors; everything below operates on the phi.
882  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  ; Mask <4, 5, 6, 7> picks elements 4..7 of %m (the subvector under test).
883  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Per lane: element > -1 (signed) selects -32768, otherwise -1.
884  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
885  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
886  ret <4 x i16> %r2
887}
888
; Test: extract four half elements (shuffle mask <0, 1, 2, 2>) from a
; <16 x half> value that reaches %exit through a phi of two volatile loads.
;   %p0 - global (addrspace 1) pointer loaded in block %T
;   %p1 - global (addrspace 1) pointer loaded in block %F
; Returns a <4 x half> where each lane is 0xH3900 if the extracted element
; compares "ugt" (unordered or greater than) against 0xH3800, and 0xH3D00
; otherwise.
; The assertions below are autogenerated (see the header note of this file);
; regenerate them with the update script instead of editing them by hand.
889define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
890;
891; SI-LABEL: vec_16xf16_extract_4xf16:
892; SI:       ; %bb.0:
893; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894; SI-NEXT:    s_cbranch_scc0 .LBB5_2
895; SI-NEXT:  ; %bb.1: ; %F
896; SI-NEXT:    s_mov_b32 s6, 0
897; SI-NEXT:    s_mov_b32 s7, 0xf000
898; SI-NEXT:    s_mov_b32 s4, s6
899; SI-NEXT:    s_mov_b32 s5, s6
900; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
901; SI-NEXT:    s_waitcnt vmcnt(0)
902; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
903; SI-NEXT:    s_waitcnt vmcnt(0)
904; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
905; SI-NEXT:    s_waitcnt vmcnt(0)
906; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
907; SI-NEXT:    s_waitcnt vmcnt(0)
908; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
909; SI-NEXT:    s_waitcnt vmcnt(0)
910; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
911; SI-NEXT:    s_waitcnt vmcnt(0)
912; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
913; SI-NEXT:    s_waitcnt vmcnt(0)
914; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
915; SI-NEXT:    s_waitcnt vmcnt(0)
916; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
917; SI-NEXT:    s_waitcnt vmcnt(0)
918; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
919; SI-NEXT:    s_waitcnt vmcnt(0)
920; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
921; SI-NEXT:    s_waitcnt vmcnt(0)
922; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
923; SI-NEXT:    s_waitcnt vmcnt(0)
924; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
925; SI-NEXT:    s_waitcnt vmcnt(0)
926; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
927; SI-NEXT:    s_waitcnt vmcnt(0)
928; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
929; SI-NEXT:    s_waitcnt vmcnt(0)
930; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
931; SI-NEXT:    s_waitcnt vmcnt(0)
932; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
933; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
934; SI-NEXT:    v_cvt_f32_f16_e32 v3, v5
935; SI-NEXT:    v_or_b32_e32 v2, v6, v2
936; SI-NEXT:    v_or_b32_e32 v4, v4, v7
937; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
938; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
939; SI-NEXT:    s_mov_b64 vcc, exec
940; SI-NEXT:    s_cbranch_execz .LBB5_3
941; SI-NEXT:    s_branch .LBB5_4
942; SI-NEXT:  .LBB5_2:
943; SI-NEXT:    ; implicit-def: $vgpr4
944; SI-NEXT:    ; implicit-def: $vgpr3
945; SI-NEXT:    ; implicit-def: $vgpr2
946; SI-NEXT:    s_mov_b64 vcc, 0
947; SI-NEXT:  .LBB5_3: ; %T
948; SI-NEXT:    s_mov_b32 s6, 0
949; SI-NEXT:    s_mov_b32 s7, 0xf000
950; SI-NEXT:    s_mov_b32 s4, s6
951; SI-NEXT:    s_mov_b32 s5, s6
952; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
953; SI-NEXT:    s_waitcnt vmcnt(0)
954; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
955; SI-NEXT:    s_waitcnt vmcnt(0)
956; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
957; SI-NEXT:    s_waitcnt vmcnt(0)
958; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
959; SI-NEXT:    s_waitcnt vmcnt(0)
960; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
961; SI-NEXT:    s_waitcnt vmcnt(0)
962; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
963; SI-NEXT:    s_waitcnt vmcnt(0)
964; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
965; SI-NEXT:    s_waitcnt vmcnt(0)
966; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
967; SI-NEXT:    s_waitcnt vmcnt(0)
968; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
969; SI-NEXT:    s_waitcnt vmcnt(0)
970; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
971; SI-NEXT:    s_waitcnt vmcnt(0)
972; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
973; SI-NEXT:    s_waitcnt vmcnt(0)
974; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
975; SI-NEXT:    s_waitcnt vmcnt(0)
976; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
977; SI-NEXT:    s_waitcnt vmcnt(0)
978; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
979; SI-NEXT:    s_waitcnt vmcnt(0)
980; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
981; SI-NEXT:    s_waitcnt vmcnt(0)
982; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
983; SI-NEXT:    s_waitcnt vmcnt(0)
984; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
985; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
986; SI-NEXT:    v_or_b32_e32 v0, v4, v0
987; SI-NEXT:    v_or_b32_e32 v1, v2, v1
988; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
989; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
990; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
991; SI-NEXT:  .LBB5_4: ; %exit
992; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
993; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
994; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
995; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
996; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
997; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
998; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
999; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1000; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
1001; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
1002; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
1003; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
1004; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
1005; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
1006; SI-NEXT:    v_mov_b32_e32 v3, v2
1007; SI-NEXT:    s_setpc_b64 s[30:31]
1008;
1009; GFX9-LABEL: vec_16xf16_extract_4xf16:
1010; GFX9:       ; %bb.0:
1011; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1012; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
1013; GFX9-NEXT:  ; %bb.1: ; %F
1014; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
1015; GFX9-NEXT:    s_waitcnt vmcnt(0)
1016; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
1017; GFX9-NEXT:    s_waitcnt vmcnt(0)
1018; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1019; GFX9-NEXT:    s_cbranch_execz .LBB5_3
1020; GFX9-NEXT:    s_branch .LBB5_4
1021; GFX9-NEXT:  .LBB5_2:
1022; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
1023; GFX9-NEXT:  .LBB5_3: ; %T
1024; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
1025; GFX9-NEXT:    s_waitcnt vmcnt(0)
1026; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
1027; GFX9-NEXT:    s_waitcnt vmcnt(0)
1028; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1029; GFX9-NEXT:  .LBB5_4: ; %exit
1030; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
1031; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
1032; GFX9-NEXT:    s_waitcnt vmcnt(0)
1033; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v4
1034; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3800
1035; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
1036; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
1037; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1038; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, 0.5, v5
1039; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
1040; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v5
1041; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1042; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v4
1043; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v3
1044; GFX9-NEXT:    s_setpc_b64 s[30:31]
1045;
1046; GFX11-LABEL: vec_16xf16_extract_4xf16:
1047; GFX11:       ; %bb.0:
1048; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049; GFX11-NEXT:    s_cbranch_scc0 .LBB5_2
1050; GFX11-NEXT:  ; %bb.1: ; %F
1051; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
1052; GFX11-NEXT:    s_waitcnt vmcnt(0)
1053; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
1054; GFX11-NEXT:    s_waitcnt vmcnt(0)
1055; GFX11-NEXT:    s_cbranch_execz .LBB5_3
1056; GFX11-NEXT:    s_branch .LBB5_4
1057; GFX11-NEXT:  .LBB5_2:
1058; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1059; GFX11-NEXT:  .LBB5_3: ; %T
1060; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
1061; GFX11-NEXT:    s_waitcnt vmcnt(0)
1062; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
1063; GFX11-NEXT:    s_waitcnt vmcnt(0)
1064; GFX11-NEXT:  .LBB5_4: ; %exit
1065; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
1066; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1067; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
1068; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1069; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
1070; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
1071; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
1072; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
1073; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1074; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
1075; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
1076; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
1077; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
1078; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
1079; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; The branch condition is undef, so the taken successor is unspecified; the
  ; expected output above contains code for both the %T and the %F path.
1080  br i1 undef, label %T, label %F
1081
1082T:
  ; Volatile load of the full 16-element vector from %p0.
1083  %t = load volatile <16 x half>, ptr addrspace(1) %p0
1084  br label %exit
1085
1086F:
  ; Volatile load of the full 16-element vector from %p1.
1087  %f = load volatile <16 x half>, ptr addrspace(1) %p1
1088  br label %exit
1089
1090exit:
  ; Merge the two loaded vectors; everything below operates on the phi.
1091  %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
  ; Mask <0, 1, 2, 2>: result lanes 2 and 3 both read source element 2, so
  ; source element 3 is never used.
1092  %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; 0xH3800 is 0.5 in half precision (the expected output compares against
  ; the literal 0.5). "ugt" is also true when the element is a NaN.
1093  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
1094  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
1095  ret <4 x half> %r2
1096}
1097
; Test: build an <8 x i16> from eight scalar i16 loads out of addrspace(3)
; memory and return it.
;   %p    - addrspace(3) base pointer
;   %idxp - index; shifted left by 4 to form the first element index
; The loads are grouped into four pairs (elements idx+0/1, idx+2/3, idx+4/5,
; idx+6/7). Each pair is inserted into lanes 0 and 1 of its own vector, and
; three shufflevectors then concatenate the pairs in order.
; In the expected GFX9 and GFX11 output the eight loads become two paired
; 32-bit LDS reads (ds_read2_b32 / ds_load_2addr_b32).
; The assertions below are autogenerated (see the header note of this file);
; regenerate them with the update script instead of editing them by hand.
1098define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
1099; SI-LABEL: large_vector:
1100; SI:       ; %bb.0:
1101; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; SI-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
1103; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1104; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
1105; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
1106; SI-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
1107; SI-NEXT:    s_mov_b32 m0, -1
1108; SI-NEXT:    ds_read_b32 v0, v0
1109; SI-NEXT:    ds_read_b32 v2, v1
1110; SI-NEXT:    ds_read_b32 v4, v3
1111; SI-NEXT:    ds_read_b32 v6, v5
1112; SI-NEXT:    s_waitcnt lgkmcnt(3)
1113; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1114; SI-NEXT:    s_waitcnt lgkmcnt(2)
1115; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1116; SI-NEXT:    s_waitcnt lgkmcnt(1)
1117; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1118; SI-NEXT:    s_waitcnt lgkmcnt(0)
1119; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
1120; SI-NEXT:    s_setpc_b64 s[30:31]
1121;
1122; GFX9-LABEL: large_vector:
1123; GFX9:       ; %bb.0:
1124; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125; GFX9-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
1126; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
1127; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
1128; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX9-NEXT:    s_setpc_b64 s[30:31]
1130;
1131; GFX11-LABEL: large_vector:
1132; GFX11:       ; %bb.0:
1133; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134; GFX11-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
1135; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
1136; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
1137; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX11-NEXT:    s_setpc_b64 s[30:31]
  ; %idx is a multiple of 16, so its low four bits are zero and the
  ; "or disjoint" with 0..7 below acts as an addition.
1139  %idx = shl i32 %idxp, 4
1140
  ; Pair 0: elements idx+0 and idx+1. The GEPs use the half type, so each
  ; index step is two bytes. The even element is loaded with align 4 and the
  ; odd one with align 2.
1141  %i.0 = or disjoint i32 %idx, 0
1142  %p.0 = getelementptr half, ptr addrspace(3) %p, i32 %i.0
1143  %x.0 = load i16, ptr addrspace(3) %p.0, align 4
1144  %v0p = insertelement <8 x i16> poison, i16 %x.0, i32 0
1145  %i.1 = or disjoint i32 %idx, 1
1146  %p.1 = getelementptr half, ptr addrspace(3) %p, i32 %i.1
1147  %x.1 = load i16, ptr addrspace(3) %p.1, align 2
1148  %v0 = insertelement <8 x i16> %v0p, i16 %x.1, i32 1
1149
  ; Pair 1: elements idx+2 and idx+3, placed in lanes 0 and 1 of %v1.
1150  %i.2 = or disjoint i32 %idx, 2
1151  %p.2 = getelementptr half, ptr addrspace(3) %p, i32 %i.2
1152  %x.2 = load i16, ptr addrspace(3) %p.2, align 4
1153  %v1p = insertelement <8 x i16> poison, i16 %x.2, i32 0
1154  %i.3 = or disjoint i32 %idx, 3
1155  %p.3 = getelementptr half, ptr addrspace(3) %p, i32 %i.3
1156  %x.3 = load i16, ptr addrspace(3) %p.3, align 2
1157  %v1 = insertelement <8 x i16> %v1p, i16 %x.3, i32 1
1158
  ; Pair 2: elements idx+4 and idx+5, placed in lanes 0 and 1 of %v2.
1159  %i.4 = or disjoint i32 %idx, 4
1160  %p.4 = getelementptr half, ptr addrspace(3) %p, i32 %i.4
1161  %x.4 = load i16, ptr addrspace(3) %p.4, align 4
1162  %v2p = insertelement <8 x i16> poison, i16 %x.4, i32 0
1163  %i.5 = or disjoint i32 %idx, 5
1164  %p.5 = getelementptr half, ptr addrspace(3) %p, i32 %i.5
1165  %x.5 = load i16, ptr addrspace(3) %p.5, align 2
1166  %v2 = insertelement <8 x i16> %v2p, i16 %x.5, i32 1
1167
  ; Pair 3: elements idx+6 and idx+7, placed in lanes 0 and 1 of %v3.
1168  %i.6 = or disjoint i32 %idx, 6
1169  %p.6 = getelementptr half, ptr addrspace(3) %p, i32 %i.6
1170  %x.6 = load i16, ptr addrspace(3) %p.6, align 4
1171  %v3p = insertelement <8 x i16> poison, i16 %x.6, i32 0
1172  %i.7 = or disjoint i32 %idx, 7
1173  %p.7 = getelementptr half, ptr addrspace(3) %p, i32 %i.7
1174  %x.7 = load i16, ptr addrspace(3) %p.7, align 2
1175  %v3 = insertelement <8 x i16> %v3p, i16 %x.7, i32 1
1176
  ; Concatenate the pairs. In each mask, indices 8 and 9 select lanes 0 and 1
  ; of the second operand, appending one more pair to the lanes already
  ; filled.
1177  %z.1 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
1178  %z.2 = shufflevector <8 x i16> %z.1, <8 x i16> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
1179  %z.3 = shufflevector <8 x i16> %z.2, <8 x i16> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
1180  ret <8 x i16> %z.3
1181}
1182
1183define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
1184; SI-LABEL: vec_16xi16_extract_8xi16_0:
1185; SI:       ; %bb.0:
1186; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187; SI-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
1188; SI-NEXT:    s_waitcnt vmcnt(0)
1189; SI-NEXT:    v_and_b32_e32 v4, 1, v4
1190; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
1191; SI-NEXT:    s_and_b64 s[34:35], vcc, exec
1192; SI-NEXT:    s_mov_b32 s38, 0
1193; SI-NEXT:    s_cbranch_scc0 .LBB7_2
1194; SI-NEXT:  ; %bb.1: ; %F
1195; SI-NEXT:    s_mov_b32 s39, 0xf000
1196; SI-NEXT:    s_mov_b32 s36, s38
1197; SI-NEXT:    s_mov_b32 s37, s38
1198; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
1199; SI-NEXT:    s_waitcnt vmcnt(0)
1200; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
1201; SI-NEXT:    s_waitcnt vmcnt(0)
1202; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
1203; SI-NEXT:    s_waitcnt vmcnt(0)
1204; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
1205; SI-NEXT:    s_waitcnt vmcnt(0)
1206; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
1207; SI-NEXT:    s_waitcnt vmcnt(0)
1208; SI-NEXT:    buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
1209; SI-NEXT:    s_waitcnt vmcnt(0)
1210; SI-NEXT:    buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
1211; SI-NEXT:    s_waitcnt vmcnt(0)
1212; SI-NEXT:    buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
1213; SI-NEXT:    s_waitcnt vmcnt(0)
1214; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
1215; SI-NEXT:    s_waitcnt vmcnt(0)
1216; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:18 glc
1217; SI-NEXT:    s_waitcnt vmcnt(0)
1218; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:20 glc
1219; SI-NEXT:    s_waitcnt vmcnt(0)
1220; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:22 glc
1221; SI-NEXT:    s_waitcnt vmcnt(0)
1222; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:24 glc
1223; SI-NEXT:    s_waitcnt vmcnt(0)
1224; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:26 glc
1225; SI-NEXT:    s_waitcnt vmcnt(0)
1226; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:28 glc
1227; SI-NEXT:    s_waitcnt vmcnt(0)
1228; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
1229; SI-NEXT:    s_waitcnt vmcnt(0)
1230; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
1231; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
1232; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
1233; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v4
1234; SI-NEXT:    v_or_b32_e32 v5, v10, v2
1235; SI-NEXT:    v_or_b32_e32 v4, v8, v3
1236; SI-NEXT:    v_or_b32_e32 v3, v7, v9
1237; SI-NEXT:    v_or_b32_e32 v2, v6, v11
1238; SI-NEXT:    s_mov_b64 vcc, exec
1239; SI-NEXT:    s_cbranch_execz .LBB7_3
1240; SI-NEXT:    s_branch .LBB7_4
1241; SI-NEXT:  .LBB7_2:
1242; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1243; SI-NEXT:    s_mov_b64 vcc, 0
1244; SI-NEXT:  .LBB7_3: ; %T
1245; SI-NEXT:    s_mov_b32 s39, 0xf000
1246; SI-NEXT:    s_mov_b32 s36, s38
1247; SI-NEXT:    s_mov_b32 s37, s38
1248; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
1249; SI-NEXT:    s_waitcnt vmcnt(0)
1250; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
1251; SI-NEXT:    s_waitcnt vmcnt(0)
1252; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
1253; SI-NEXT:    s_waitcnt vmcnt(0)
1254; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
1255; SI-NEXT:    s_waitcnt vmcnt(0)
1256; SI-NEXT:    buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
1257; SI-NEXT:    s_waitcnt vmcnt(0)
1258; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
1259; SI-NEXT:    s_waitcnt vmcnt(0)
1260; SI-NEXT:    buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
1261; SI-NEXT:    s_waitcnt vmcnt(0)
1262; SI-NEXT:    buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
1263; SI-NEXT:    s_waitcnt vmcnt(0)
1264; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
1265; SI-NEXT:    s_waitcnt vmcnt(0)
1266; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc
1267; SI-NEXT:    s_waitcnt vmcnt(0)
1268; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc
1269; SI-NEXT:    s_waitcnt vmcnt(0)
1270; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc
1271; SI-NEXT:    s_waitcnt vmcnt(0)
1272; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc
1273; SI-NEXT:    s_waitcnt vmcnt(0)
1274; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc
1275; SI-NEXT:    s_waitcnt vmcnt(0)
1276; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc
1277; SI-NEXT:    s_waitcnt vmcnt(0)
1278; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
1279; SI-NEXT:    s_waitcnt vmcnt(0)
1280; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
1281; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
1282; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
1283; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
1284; SI-NEXT:    v_or_b32_e32 v5, v8, v0
1285; SI-NEXT:    v_or_b32_e32 v4, v7, v1
1286; SI-NEXT:    v_or_b32_e32 v3, v6, v9
1287; SI-NEXT:    v_or_b32_e32 v2, v2, v10
1288; SI-NEXT:  .LBB7_4: ; %exit
1289; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
1290; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1291; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
1292; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
1293; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
1294; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
1295; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
1296; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1297; SI-NEXT:    s_movk_i32 s34, 0x3800
1298; SI-NEXT:    v_mov_b32_e32 v8, 0x3d000000
1299; SI-NEXT:    v_mov_b32_e32 v9, 0x39000000
1300; SI-NEXT:    v_mov_b32_e32 v10, 0x3d00
1301; SI-NEXT:    v_mov_b32_e32 v11, 0x3900
1302; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v0
1303; SI-NEXT:    v_cndmask_b32_e32 v12, v8, v9, vcc
1304; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v1
1305; SI-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc
1306; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v2
1307; SI-NEXT:    v_cndmask_b32_e32 v13, v8, v9, vcc
1308; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v4
1309; SI-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
1310; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v6
1311; SI-NEXT:    v_cndmask_b32_e32 v14, v8, v9, vcc
1312; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v5
1313; SI-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc
1314; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v7
1315; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
1316; SI-NEXT:    v_cmp_lt_u32_e32 vcc, s34, v3
1317; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v11, vcc
1318; SI-NEXT:    v_or_b32_e32 v0, v0, v12
1319; SI-NEXT:    v_or_b32_e32 v4, v1, v13
1320; SI-NEXT:    v_or_b32_e32 v6, v2, v14
1321; SI-NEXT:    v_or_b32_e32 v2, v3, v5
1322; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
1323; SI-NEXT:    v_alignbit_b32 v1, v2, v12, 16
1324; SI-NEXT:    v_alignbit_b32 v5, v6, v13, 16
1325; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
1326; SI-NEXT:    s_setpc_b64 s[30:31]
1327;
1328; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
1329; GFX9:       ; %bb.0:
1330; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1331; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
1332; GFX9-NEXT:    s_waitcnt vmcnt(0)
1333; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
1334; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
1335; GFX9-NEXT:    s_and_b64 s[34:35], vcc, exec
1336; GFX9-NEXT:    s_cbranch_scc0 .LBB7_2
1337; GFX9-NEXT:  ; %bb.1: ; %F
1338; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
1339; GFX9-NEXT:    s_waitcnt vmcnt(0)
1340; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
1341; GFX9-NEXT:    s_waitcnt vmcnt(0)
1342; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1343; GFX9-NEXT:    s_cbranch_execz .LBB7_3
1344; GFX9-NEXT:    s_branch .LBB7_4
1345; GFX9-NEXT:  .LBB7_2:
1346; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
1347; GFX9-NEXT:  .LBB7_3: ; %T
1348; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
1349; GFX9-NEXT:    s_waitcnt vmcnt(0)
1350; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
1351; GFX9-NEXT:    s_waitcnt vmcnt(0)
1352; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1353; GFX9-NEXT:  .LBB7_4: ; %exit
1354; GFX9-NEXT:    s_movk_i32 s35, 0x3801
1355; GFX9-NEXT:    s_movk_i32 s34, 0x3800
1356; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
1357; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
1358; GFX9-NEXT:    s_waitcnt vmcnt(0)
1359; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, s35, v7
1360; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
1361; GFX9-NEXT:    v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
1362; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v0, vcc
1363; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, s35, v6
1364; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
1365; GFX9-NEXT:    v_cmp_lt_u16_sdwa vcc, v6, s35 src0_sel:WORD_1 src1_sel:DWORD
1366; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
1367; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, s35, v5
1368; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
1369; GFX9-NEXT:    v_cmp_lt_u16_sdwa vcc, v5, s35 src0_sel:WORD_1 src1_sel:DWORD
1370; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1371; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, s35, v4
1372; GFX9-NEXT:    v_cndmask_b32_e32 v9, v0, v1, vcc
1373; GFX9-NEXT:    v_cmp_lt_u16_sdwa vcc, v4, s35 src0_sel:WORD_1 src1_sel:DWORD
1374; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
1375; GFX9-NEXT:    s_mov_b32 s34, 0x5040100
1376; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s34
1377; GFX9-NEXT:    v_perm_b32 v1, v5, v8, s34
1378; GFX9-NEXT:    v_perm_b32 v2, v6, v2, s34
1379; GFX9-NEXT:    v_perm_b32 v3, v7, v3, s34
1380; GFX9-NEXT:    s_setpc_b64 s[30:31]
1381;
1382; GFX11-LABEL: vec_16xi16_extract_8xi16_0:
1383; GFX11:       ; %bb.0:
1384; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1385; GFX11-NEXT:    scratch_load_u8 v4, off, s32
1386; GFX11-NEXT:    s_mov_b32 s0, 0
1387; GFX11-NEXT:    s_waitcnt vmcnt(0)
1388; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
1389; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1390; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
1391; GFX11-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
1392; GFX11-NEXT:    s_cbranch_scc0 .LBB7_2
1393; GFX11-NEXT:  ; %bb.1: ; %F
1394; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
1395; GFX11-NEXT:    s_waitcnt vmcnt(0)
1396; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
1397; GFX11-NEXT:    s_waitcnt vmcnt(0)
1398; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
1399; GFX11-NEXT:    s_cbranch_vccz .LBB7_3
1400; GFX11-NEXT:    s_branch .LBB7_4
1401; GFX11-NEXT:  .LBB7_2:
1402; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1403; GFX11-NEXT:  .LBB7_3: ; %T
1404; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
1405; GFX11-NEXT:    s_waitcnt vmcnt(0)
1406; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
1407; GFX11-NEXT:    s_waitcnt vmcnt(0)
1408; GFX11-NEXT:  .LBB7_4: ; %exit
1409; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
1410; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
1411; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
1412; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1413; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
1414; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
1415; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1416; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
1417; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4
1418; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
1419; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7
1420; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
1421; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3
1422; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
1423; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2
1424; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
1425; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0
1426; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
1427; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6
1428; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1429; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
1430; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
1431; GFX11-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8
1432; GFX11-NEXT:    v_perm_b32 v2, v7, v4, 0x5040100
1433; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
1434; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
1435; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1436; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
1437; GFX11-NEXT:    s_setpc_b64 s[30:31]
1438  br i1 %cond, label %T, label %F
1439
1440T:
1441  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
1442  br label %exit
1443
1444F:
1445  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
1446  br label %exit
1447
1448exit:
1449  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
1450  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1451  %b2 = icmp ugt <8 x i16> %v2, <i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800>
1452  %r2 = select <8 x i1> %b2, <8 x i16> <i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900>, <8 x i16> <i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00>
1453  ret <8 x i16> %r2
1454}
1455
; Branches on %cond, volatile-loads a <16 x half> from %p0 (block T) or from
; %p1 (block F), keeps elements 0-7 of the merged value, and selects per lane
; between 0xH3900 and 0xH3D00 using an unordered-or-greater-than compare
; against 0xH3800 (printed as 0.5 in the checks below).
; The SI/GFX9/GFX11 check lines are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1456define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
1457; SI-LABEL: vec_16xf16_extract_8xf16_0:
1458; SI:       ; %bb.0:
1459; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1460; SI-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
1461; SI-NEXT:    s_waitcnt vmcnt(0)
1462; SI-NEXT:    v_and_b32_e32 v4, 1, v4
1463; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
1464; SI-NEXT:    s_and_b64 s[34:35], vcc, exec
1465; SI-NEXT:    s_mov_b32 s38, 0
1466; SI-NEXT:    s_cbranch_scc0 .LBB8_2
1467; SI-NEXT:  ; %bb.1: ; %F
1468; SI-NEXT:    s_mov_b32 s39, 0xf000
1469; SI-NEXT:    s_mov_b32 s36, s38
1470; SI-NEXT:    s_mov_b32 s37, s38
1471; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 glc
1472; SI-NEXT:    s_waitcnt vmcnt(0)
1473; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc
1474; SI-NEXT:    s_waitcnt vmcnt(0)
1475; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
1476; SI-NEXT:    s_waitcnt vmcnt(0)
1477; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
1478; SI-NEXT:    s_waitcnt vmcnt(0)
1479; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
1480; SI-NEXT:    s_waitcnt vmcnt(0)
1481; SI-NEXT:    buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
1482; SI-NEXT:    s_waitcnt vmcnt(0)
1483; SI-NEXT:    buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
1484; SI-NEXT:    s_waitcnt vmcnt(0)
1485; SI-NEXT:    buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
1486; SI-NEXT:    s_waitcnt vmcnt(0)
1487; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
1488; SI-NEXT:    s_waitcnt vmcnt(0)
1489; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:18 glc
1490; SI-NEXT:    s_waitcnt vmcnt(0)
1491; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:20 glc
1492; SI-NEXT:    s_waitcnt vmcnt(0)
1493; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:22 glc
1494; SI-NEXT:    s_waitcnt vmcnt(0)
1495; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:24 glc
1496; SI-NEXT:    s_waitcnt vmcnt(0)
1497; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:26 glc
1498; SI-NEXT:    s_waitcnt vmcnt(0)
1499; SI-NEXT:    buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:28 glc
1500; SI-NEXT:    s_waitcnt vmcnt(0)
1501; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
1502; SI-NEXT:    s_waitcnt vmcnt(0)
1503; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
1504; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
1505; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
1506; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v6
1507; SI-NEXT:    v_cvt_f32_f16_e32 v2, v11
1508; SI-NEXT:    v_cvt_f32_f16_e32 v3, v9
1509; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1510; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1511; SI-NEXT:    v_or_b32_e32 v9, v10, v12
1512; SI-NEXT:    v_or_b32_e32 v8, v8, v13
1513; SI-NEXT:    v_or_b32_e32 v10, v7, v14
1514; SI-NEXT:    v_or_b32_e32 v11, v5, v15
1515; SI-NEXT:    v_cvt_f32_f16_e32 v5, v9
1516; SI-NEXT:    v_cvt_f32_f16_e32 v7, v8
1517; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
1518; SI-NEXT:    v_cvt_f32_f16_e32 v9, v11
1519; SI-NEXT:    s_mov_b64 vcc, exec
1520; SI-NEXT:    s_cbranch_execz .LBB8_3
1521; SI-NEXT:    s_branch .LBB8_4
1522; SI-NEXT:  .LBB8_2:
1523; SI-NEXT:    ; implicit-def: $vgpr9
1524; SI-NEXT:    ; implicit-def: $vgpr6
1525; SI-NEXT:    ; implicit-def: $vgpr8
1526; SI-NEXT:    ; implicit-def: $vgpr4
1527; SI-NEXT:    ; implicit-def: $vgpr7
1528; SI-NEXT:    ; implicit-def: $vgpr3
1529; SI-NEXT:    ; implicit-def: $vgpr5
1530; SI-NEXT:    ; implicit-def: $vgpr2
1531; SI-NEXT:    s_mov_b64 vcc, 0
1532; SI-NEXT:  .LBB8_3: ; %T
1533; SI-NEXT:    s_mov_b32 s39, 0xf000
1534; SI-NEXT:    s_mov_b32 s36, s38
1535; SI-NEXT:    s_mov_b32 s37, s38
1536; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc
1537; SI-NEXT:    s_waitcnt vmcnt(0)
1538; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc
1539; SI-NEXT:    s_waitcnt vmcnt(0)
1540; SI-NEXT:    buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc
1541; SI-NEXT:    s_waitcnt vmcnt(0)
1542; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
1543; SI-NEXT:    s_waitcnt vmcnt(0)
1544; SI-NEXT:    buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
1545; SI-NEXT:    s_waitcnt vmcnt(0)
1546; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:10 glc
1547; SI-NEXT:    s_waitcnt vmcnt(0)
1548; SI-NEXT:    buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc
1549; SI-NEXT:    s_waitcnt vmcnt(0)
1550; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:14 glc
1551; SI-NEXT:    s_waitcnt vmcnt(0)
1552; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
1553; SI-NEXT:    s_waitcnt vmcnt(0)
1554; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc
1555; SI-NEXT:    s_waitcnt vmcnt(0)
1556; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc
1557; SI-NEXT:    s_waitcnt vmcnt(0)
1558; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc
1559; SI-NEXT:    s_waitcnt vmcnt(0)
1560; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc
1561; SI-NEXT:    s_waitcnt vmcnt(0)
1562; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc
1563; SI-NEXT:    s_waitcnt vmcnt(0)
1564; SI-NEXT:    buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc
1565; SI-NEXT:    s_waitcnt vmcnt(0)
1566; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
1567; SI-NEXT:    s_waitcnt vmcnt(0)
1568; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
1569; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
1570; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v4
1571; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
1572; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1573; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1574; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1575; SI-NEXT:    v_or_b32_e32 v0, v9, v0
1576; SI-NEXT:    v_or_b32_e32 v1, v8, v1
1577; SI-NEXT:    v_or_b32_e32 v8, v7, v10
1578; SI-NEXT:    v_or_b32_e32 v9, v5, v11
1579; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1580; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1581; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1582; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
1583; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1584; SI-NEXT:  .LBB8_4: ; %exit
1585; SI-NEXT:    v_cvt_f16_f32_e32 v0, v9
1586; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
1587; SI-NEXT:    v_cvt_f16_f32_e32 v6, v8
1588; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1589; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1590; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
1591; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1592; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1593; SI-NEXT:    v_mov_b32_e32 v8, 0x3fa00000
1594; SI-NEXT:    v_mov_b32_e32 v9, 0x3f200000
1595; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1596; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1597; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1598; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1599; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1600; SI-NEXT:    v_cvt_f32_f16_e32 v10, v3
1601; SI-NEXT:    v_cvt_f32_f16_e32 v11, v5
1602; SI-NEXT:    v_cvt_f32_f16_e32 v12, v2
1603; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
1604; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc
1605; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
1606; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc
1607; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v6
1608; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
1609; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v4
1610; SI-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
1611; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v7
1612; SI-NEXT:    v_cndmask_b32_e32 v4, v8, v9, vcc
1613; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v10
1614; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
1615; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v11
1616; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc
1617; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v12
1618; SI-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
1619; SI-NEXT:    s_setpc_b64 s[30:31]
1620;
1621; GFX9-LABEL: vec_16xf16_extract_8xf16_0:
1622; GFX9:       ; %bb.0:
1623; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1624; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
1625; GFX9-NEXT:    s_waitcnt vmcnt(0)
1626; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
1627; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
1628; GFX9-NEXT:    s_and_b64 s[34:35], vcc, exec
1629; GFX9-NEXT:    s_cbranch_scc0 .LBB8_2
1630; GFX9-NEXT:  ; %bb.1: ; %F
1631; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
1632; GFX9-NEXT:    s_waitcnt vmcnt(0)
1633; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
1634; GFX9-NEXT:    s_waitcnt vmcnt(0)
1635; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1636; GFX9-NEXT:    s_cbranch_execz .LBB8_3
1637; GFX9-NEXT:    s_branch .LBB8_4
1638; GFX9-NEXT:  .LBB8_2:
1639; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
1640; GFX9-NEXT:  .LBB8_3: ; %T
1641; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
1642; GFX9-NEXT:    s_waitcnt vmcnt(0)
1643; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
1644; GFX9-NEXT:    s_waitcnt vmcnt(0)
1645; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1646; GFX9-NEXT:  .LBB8_4: ; %exit
1647; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3800
1648; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3900
1649; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3d00
1650; GFX9-NEXT:    s_waitcnt vmcnt(0)
1651; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v7
1652; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v2, vcc
1653; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD
1654; GFX9-NEXT:    v_cndmask_b32_e32 v7, v2, v1, vcc
1655; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v6
1656; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v2, vcc
1657; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v6, v0 src0_sel:WORD_1 src1_sel:DWORD
1658; GFX9-NEXT:    v_cndmask_b32_e32 v6, v1, v2, vcc
1659; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v5
1660; GFX9-NEXT:    v_cndmask_b32_e32 v9, v1, v2, vcc
1661; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v5, v0 src0_sel:WORD_1 src1_sel:DWORD
1662; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
1663; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v4
1664; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v2, vcc
1665; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
1666; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
1667; GFX9-NEXT:    v_pack_b32_f16 v0, v10, v0
1668; GFX9-NEXT:    v_pack_b32_f16 v1, v9, v5
1669; GFX9-NEXT:    v_pack_b32_f16 v2, v8, v6
1670; GFX9-NEXT:    v_pack_b32_f16 v3, v3, v7
1671; GFX9-NEXT:    s_setpc_b64 s[30:31]
1672;
1673; GFX11-LABEL: vec_16xf16_extract_8xf16_0:
1674; GFX11:       ; %bb.0:
1675; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1676; GFX11-NEXT:    scratch_load_u8 v4, off, s32
1677; GFX11-NEXT:    s_mov_b32 s0, 0
1678; GFX11-NEXT:    s_waitcnt vmcnt(0)
1679; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
1680; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1681; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
1682; GFX11-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
1683; GFX11-NEXT:    s_cbranch_scc0 .LBB8_2
1684; GFX11-NEXT:  ; %bb.1: ; %F
1685; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
1686; GFX11-NEXT:    s_waitcnt vmcnt(0)
1687; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
1688; GFX11-NEXT:    s_waitcnt vmcnt(0)
1689; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
1690; GFX11-NEXT:    s_cbranch_vccz .LBB8_3
1691; GFX11-NEXT:    s_branch .LBB8_4
1692; GFX11-NEXT:  .LBB8_2:
1693; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1694; GFX11-NEXT:  .LBB8_3: ; %T
1695; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
1696; GFX11-NEXT:    s_waitcnt vmcnt(0)
1697; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
1698; GFX11-NEXT:    s_waitcnt vmcnt(0)
1699; GFX11-NEXT:  .LBB8_4: ; %exit
1700; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
1701; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
1702; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
1703; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1704; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
1705; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
1706; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1707; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
1708; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v4
1709; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
1710; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v7
1711; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
1712; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
1713; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
1714; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
1715; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
1716; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v0
1717; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
1718; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v6
1719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1720; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v0
1721; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
1722; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v8
1723; GFX11-NEXT:    v_pack_b32_f16 v2, v4, v7
1724; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
1725; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
1726; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1727; GFX11-NEXT:    v_pack_b32_f16 v3, v5, v6
1728; GFX11-NEXT:    s_setpc_b64 s[30:31]
1729  br i1 %cond, label %T, label %F
1730
1731T:
1732  %t = load volatile <16 x half>, ptr addrspace(1) %p0
1733  br label %exit
1734
1735F:
1736  %f = load volatile <16 x half>, ptr addrspace(1) %p1
1737  br label %exit
1738
1739exit:
1740  %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
; The mask only picks lanes 0-7 of %m, so the second operand is never read;
; use poison as the placeholder instead of the deprecated undef.
1741  %v2 = shufflevector <16 x half> %m, <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1742  %b2 = fcmp ugt <8 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
1743  %r2 = select <8 x i1> %b2, <8 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <8 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
1744  ret <8 x half> %r2
1745}
1746