xref: /llvm-project/llvm/test/CodeGen/AMDGPU/extract-subvector.ll (revision 5da7179cb3ff80203f58ddea71562816b2ae4ff6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
3
; Extracts the low two lanes of an <8 x i16> value that reaches %exit through a
; phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen by %c0).
; Each extracted lane yields -32768 when it is signed-greater-than -1 and -1
; otherwise. The assertions below still expect loads covering all eight source
; lanes in both branches.
4define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
5; GCN-LABEL: extract_2xi16:
6; GCN:       ; %bb.0:
7; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
9; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
10; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
11; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
12; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
13; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
14; GCN-NEXT:    s_cbranch_execz .LBB0_2
15; GCN-NEXT:  ; %bb.1: ; %F
16; GCN-NEXT:    s_mov_b32 s10, 0
17; GCN-NEXT:    s_mov_b32 s11, 0xf000
18; GCN-NEXT:    s_mov_b32 s8, s10
19; GCN-NEXT:    s_mov_b32 s9, s10
20; GCN-NEXT:    buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
21; GCN-NEXT:    s_waitcnt vmcnt(0)
22; GCN-NEXT:    buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
23; GCN-NEXT:    s_waitcnt vmcnt(0)
24; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
25; GCN-NEXT:    s_waitcnt vmcnt(0)
26; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
27; GCN-NEXT:    s_waitcnt vmcnt(0)
28; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
29; GCN-NEXT:    s_waitcnt vmcnt(0)
30; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
31; GCN-NEXT:    s_waitcnt vmcnt(0)
32; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
33; GCN-NEXT:    s_waitcnt vmcnt(0)
34; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
35; GCN-NEXT:    s_waitcnt vmcnt(0)
36; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
37; GCN-NEXT:    v_or_b32_e32 v4, v0, v1
38; GCN-NEXT:    ; implicit-def: $vgpr0
39; GCN-NEXT:  .LBB0_2: ; %Flow
40; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
41; GCN-NEXT:    s_cbranch_execz .LBB0_4
42; GCN-NEXT:  ; %bb.3: ; %T
43; GCN-NEXT:    s_mov_b32 s10, 0
44; GCN-NEXT:    s_mov_b32 s11, 0xf000
45; GCN-NEXT:    s_mov_b32 s8, s10
46; GCN-NEXT:    s_mov_b32 s9, s10
47; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
48; GCN-NEXT:    s_waitcnt vmcnt(0)
49; GCN-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
50; GCN-NEXT:    s_waitcnt vmcnt(0)
51; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
52; GCN-NEXT:    s_waitcnt vmcnt(0)
53; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
54; GCN-NEXT:    s_waitcnt vmcnt(0)
55; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
56; GCN-NEXT:    s_waitcnt vmcnt(0)
57; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
58; GCN-NEXT:    s_waitcnt vmcnt(0)
59; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
60; GCN-NEXT:    s_waitcnt vmcnt(0)
61; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
62; GCN-NEXT:    s_waitcnt vmcnt(0)
63; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
64; GCN-NEXT:    v_or_b32_e32 v4, v2, v0
65; GCN-NEXT:  .LBB0_4: ; %exit
66; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
67; GCN-NEXT:    v_ashrrev_i32_e32 v0, 16, v4
68; GCN-NEXT:    v_bfe_i32 v1, v4, 0, 16
69; GCN-NEXT:    v_mov_b32_e32 v2, 0xffff
70; GCN-NEXT:    v_mov_b32_e32 v3, 0x8000
71; GCN-NEXT:    v_mov_b32_e32 v4, 0xffff8000
72; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
73; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
74; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
75; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
76; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
77; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
78; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v2
79; GCN-NEXT:    s_setpc_b64 s[30:31]
80  br i1 %c0, label %T, label %F
81
82T:
83  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
84  br label %exit
85
86F:
87  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
88  br label %exit
89
90exit:
91  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-1 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
92  %v2 = shufflevector <8 x i16> %m, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
93  %b2 = icmp sgt <2 x i16> %v2, <i16 -1, i16 -1>
94  %r2 = select <2 x i1> %b2, <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> <i16 -1, i16 -1>
95  ret <2 x i16> %r2
96}
97
; Extracts the low two lanes of an <8 x i64> value that reaches %exit through a
; phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen by %c0).
; Each extracted lane yields -32768 when it is signed-greater-than -1 and -1
; otherwise. The assertions below still expect four dwordx4 loads (offsets
; 0-48) in each branch, i.e. the whole source vector.
98define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
99; GCN-LABEL: extract_2xi64:
100; GCN:       ; %bb.0:
101; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
103; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
104; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
105; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
106; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
107; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
108; GCN-NEXT:    s_cbranch_execz .LBB1_2
109; GCN-NEXT:  ; %bb.1: ; %F
110; GCN-NEXT:    s_mov_b32 s10, 0
111; GCN-NEXT:    s_mov_b32 s11, 0xf000
112; GCN-NEXT:    s_mov_b32 s8, s10
113; GCN-NEXT:    s_mov_b32 s9, s10
114; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
115; GCN-NEXT:    s_waitcnt vmcnt(0)
116; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
117; GCN-NEXT:    s_waitcnt vmcnt(0)
118; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
119; GCN-NEXT:    s_waitcnt vmcnt(0)
120; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
121; GCN-NEXT:    s_waitcnt vmcnt(0)
122; GCN-NEXT:    ; implicit-def: $vgpr0
123; GCN-NEXT:  .LBB1_2: ; %Flow
124; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
125; GCN-NEXT:    s_cbranch_execz .LBB1_4
126; GCN-NEXT:  ; %bb.3: ; %T
127; GCN-NEXT:    s_mov_b32 s10, 0
128; GCN-NEXT:    s_mov_b32 s11, 0xf000
129; GCN-NEXT:    s_mov_b32 s8, s10
130; GCN-NEXT:    s_mov_b32 s9, s10
131; GCN-NEXT:    s_waitcnt vmcnt(0)
132; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
133; GCN-NEXT:    s_waitcnt vmcnt(0)
134; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
135; GCN-NEXT:    s_waitcnt vmcnt(0)
136; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
137; GCN-NEXT:    s_waitcnt vmcnt(0)
138; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
139; GCN-NEXT:    s_waitcnt vmcnt(0)
140; GCN-NEXT:  .LBB1_4: ; %exit
141; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
142; GCN-NEXT:    s_waitcnt vmcnt(0)
143; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
144; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
145; GCN-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
146; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[6:7]
147; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v1, vcc
148; GCN-NEXT:    v_mov_b32_e32 v1, -1
149; GCN-NEXT:    v_mov_b32_e32 v3, -1
150; GCN-NEXT:    s_setpc_b64 s[30:31]
151  br i1 %c0, label %T, label %F
152
153T:
154  %t = load volatile <8 x i64>, ptr addrspace(1) %p0
155  br label %exit
156
157F:
158  %f = load volatile <8 x i64>, ptr addrspace(1) %p1
159  br label %exit
160
161exit:
162  %m = phi <8 x i64> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-1 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
163  %v2 = shufflevector <8 x i64> %m, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
164  %b2 = icmp sgt <2 x i64> %v2, <i64 -1, i64 -1>
165  %r2 = select <2 x i1> %b2, <2 x i64> <i64 -32768, i64 -32768>, <2 x i64> <i64 -1, i64 -1>
166  ret <2 x i64> %r2
167}
168
; Extracts the low four lanes of an <8 x i64> value that reaches %exit through
; a phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen by
; %c0). Each extracted lane yields -32768 when it is signed-greater-than -1 and
; -1 otherwise. The assertions below still expect four dwordx4 loads (offsets
; 0-48) in each branch, i.e. the whole source vector.
169define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
170; GCN-LABEL: extract_4xi64:
171; GCN:       ; %bb.0:
172; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
174; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
175; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
176; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
177; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
178; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
179; GCN-NEXT:    s_cbranch_execz .LBB2_2
180; GCN-NEXT:  ; %bb.1: ; %F
181; GCN-NEXT:    s_mov_b32 s10, 0
182; GCN-NEXT:    s_mov_b32 s11, 0xf000
183; GCN-NEXT:    s_mov_b32 s8, s10
184; GCN-NEXT:    s_mov_b32 s9, s10
185; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
186; GCN-NEXT:    s_waitcnt vmcnt(0)
187; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
188; GCN-NEXT:    s_waitcnt vmcnt(0)
189; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
190; GCN-NEXT:    s_waitcnt vmcnt(0)
191; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
192; GCN-NEXT:    s_waitcnt vmcnt(0)
193; GCN-NEXT:    ; implicit-def: $vgpr0
194; GCN-NEXT:  .LBB2_2: ; %Flow
195; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
196; GCN-NEXT:    s_cbranch_execz .LBB2_4
197; GCN-NEXT:  ; %bb.3: ; %T
198; GCN-NEXT:    s_mov_b32 s10, 0
199; GCN-NEXT:    s_mov_b32 s11, 0xf000
200; GCN-NEXT:    s_mov_b32 s8, s10
201; GCN-NEXT:    s_mov_b32 s9, s10
202; GCN-NEXT:    s_waitcnt vmcnt(0)
203; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
204; GCN-NEXT:    s_waitcnt vmcnt(0)
205; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
206; GCN-NEXT:    s_waitcnt vmcnt(0)
207; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
208; GCN-NEXT:    s_waitcnt vmcnt(0)
209; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
210; GCN-NEXT:    s_waitcnt vmcnt(0)
211; GCN-NEXT:  .LBB2_4: ; %exit
212; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
213; GCN-NEXT:    s_waitcnt vmcnt(0)
214; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
215; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
216; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
217; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
218; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
219; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[8:9]
220; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
221; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
222; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, vcc
223; GCN-NEXT:    v_mov_b32_e32 v1, -1
224; GCN-NEXT:    v_mov_b32_e32 v3, -1
225; GCN-NEXT:    v_mov_b32_e32 v5, -1
226; GCN-NEXT:    v_mov_b32_e32 v7, -1
227; GCN-NEXT:    s_setpc_b64 s[30:31]
228  br i1 %c0, label %T, label %F
229
230T:
231  %t = load volatile <8 x i64>, ptr addrspace(1) %p0
232  br label %exit
233
234F:
235  %f = load volatile <8 x i64>, ptr addrspace(1) %p1
236  br label %exit
237
238exit:
239  %m = phi <8 x i64> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-3 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
240  %v2 = shufflevector <8 x i64> %m, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
241  %b2 = icmp sgt <4 x i64> %v2, <i64 -1, i64 -1, i64 -1, i64 -1>
242  %r2 = select <4 x i1> %b2, <4 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768>, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
243  ret <4 x i64> %r2
244}
245
; Extracts the low eight lanes of a <16 x i64> value that reaches %exit through
; a phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen by
; %c0). Each extracted lane yields -32768 when it is signed-greater-than -1 and
; -1 otherwise. The assertions below still expect eight dwordx4 loads (offsets
; 0-112) in each branch, i.e. the whole source vector.
246define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
247; GCN-LABEL: extract_8xi64:
248; GCN:       ; %bb.0:
249; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
251; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
252; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
253; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
254; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
255; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
256; GCN-NEXT:    s_cbranch_execz .LBB3_2
257; GCN-NEXT:  ; %bb.1: ; %F
258; GCN-NEXT:    s_mov_b32 s10, 0
259; GCN-NEXT:    s_mov_b32 s11, 0xf000
260; GCN-NEXT:    s_mov_b32 s8, s10
261; GCN-NEXT:    s_mov_b32 s9, s10
262; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
263; GCN-NEXT:    s_waitcnt vmcnt(0)
264; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
265; GCN-NEXT:    s_waitcnt vmcnt(0)
266; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
267; GCN-NEXT:    s_waitcnt vmcnt(0)
268; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
269; GCN-NEXT:    s_waitcnt vmcnt(0)
270; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
271; GCN-NEXT:    s_waitcnt vmcnt(0)
272; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
273; GCN-NEXT:    s_waitcnt vmcnt(0)
274; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
275; GCN-NEXT:    s_waitcnt vmcnt(0)
276; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
277; GCN-NEXT:    s_waitcnt vmcnt(0)
278; GCN-NEXT:    ; implicit-def: $vgpr0
279; GCN-NEXT:  .LBB3_2: ; %Flow
280; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
281; GCN-NEXT:    s_cbranch_execz .LBB3_4
282; GCN-NEXT:  ; %bb.3: ; %T
283; GCN-NEXT:    s_mov_b32 s10, 0
284; GCN-NEXT:    s_mov_b32 s11, 0xf000
285; GCN-NEXT:    s_mov_b32 s8, s10
286; GCN-NEXT:    s_mov_b32 s9, s10
287; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
288; GCN-NEXT:    s_waitcnt vmcnt(0)
289; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
290; GCN-NEXT:    s_waitcnt vmcnt(0)
291; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
292; GCN-NEXT:    s_waitcnt vmcnt(0)
293; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
294; GCN-NEXT:    s_waitcnt vmcnt(0)
295; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
296; GCN-NEXT:    s_waitcnt vmcnt(0)
297; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
298; GCN-NEXT:    s_waitcnt vmcnt(0)
299; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
300; GCN-NEXT:    s_waitcnt vmcnt(0)
301; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
302; GCN-NEXT:    s_waitcnt vmcnt(0)
303; GCN-NEXT:  .LBB3_4: ; %exit
304; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
305; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
306; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
307; GCN-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
308; GCN-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
309; GCN-NEXT:    v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
310; GCN-NEXT:    v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
311; GCN-NEXT:    s_waitcnt vmcnt(0)
312; GCN-NEXT:    v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
313; GCN-NEXT:    v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
314; GCN-NEXT:    v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
315; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s[16:17]
316; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
317; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, s[4:5]
318; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, s[6:7]
319; GCN-NEXT:    v_cndmask_b32_e64 v8, v1, -1, s[8:9]
320; GCN-NEXT:    v_cndmask_b32_e64 v10, v1, -1, s[10:11]
321; GCN-NEXT:    v_cndmask_b32_e64 v12, v1, -1, s[12:13]
322; GCN-NEXT:    v_cndmask_b32_e64 v14, v1, -1, s[14:15]
323; GCN-NEXT:    v_mov_b32_e32 v1, -1
324; GCN-NEXT:    v_mov_b32_e32 v3, -1
325; GCN-NEXT:    v_mov_b32_e32 v5, -1
326; GCN-NEXT:    v_mov_b32_e32 v7, -1
327; GCN-NEXT:    v_mov_b32_e32 v9, -1
328; GCN-NEXT:    v_mov_b32_e32 v11, -1
329; GCN-NEXT:    v_mov_b32_e32 v13, -1
330; GCN-NEXT:    v_mov_b32_e32 v15, -1
331; GCN-NEXT:    s_setpc_b64 s[30:31]
332  br i1 %c0, label %T, label %F
333
334T:
335  %t = load volatile <16 x i64>, ptr addrspace(1) %p0
336  br label %exit
337
338F:
339  %f = load volatile <16 x i64>, ptr addrspace(1) %p1
340  br label %exit
341
342exit:
343  %m = phi <16 x i64> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-7 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
344  %v2 = shufflevector <16 x i64> %m, <16 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345  %b2 = icmp sgt <8 x i64> %v2, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
346  %r2 = select <8 x i1> %b2, <8 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768>, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
347  ret <8 x i64> %r2
348}
349
; Extracts the low two lanes of an <8 x double> value that reaches %exit
; through a phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen
; by %c0). Each extracted lane yields -2.0 when it compares ordered-greater-than
; -1.0 and -1.0 otherwise. The assertions below still expect four dwordx4 loads
; (offsets 0-48) in each branch, i.e. the whole source vector.
350define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
351; GCN-LABEL: extract_2xf64:
352; GCN:       ; %bb.0:
353; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
355; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
356; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
357; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
358; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
359; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
360; GCN-NEXT:    s_cbranch_execz .LBB4_2
361; GCN-NEXT:  ; %bb.1: ; %F
362; GCN-NEXT:    s_mov_b32 s10, 0
363; GCN-NEXT:    s_mov_b32 s11, 0xf000
364; GCN-NEXT:    s_mov_b32 s8, s10
365; GCN-NEXT:    s_mov_b32 s9, s10
366; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
367; GCN-NEXT:    s_waitcnt vmcnt(0)
368; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
369; GCN-NEXT:    s_waitcnt vmcnt(0)
370; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
371; GCN-NEXT:    s_waitcnt vmcnt(0)
372; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
373; GCN-NEXT:    s_waitcnt vmcnt(0)
374; GCN-NEXT:    ; implicit-def: $vgpr0
375; GCN-NEXT:  .LBB4_2: ; %Flow
376; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
377; GCN-NEXT:    s_cbranch_execz .LBB4_4
378; GCN-NEXT:  ; %bb.3: ; %T
379; GCN-NEXT:    s_mov_b32 s10, 0
380; GCN-NEXT:    s_mov_b32 s11, 0xf000
381; GCN-NEXT:    s_mov_b32 s8, s10
382; GCN-NEXT:    s_mov_b32 s9, s10
383; GCN-NEXT:    s_waitcnt vmcnt(0)
384; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
385; GCN-NEXT:    s_waitcnt vmcnt(0)
386; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
387; GCN-NEXT:    s_waitcnt vmcnt(0)
388; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
389; GCN-NEXT:    s_waitcnt vmcnt(0)
390; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
391; GCN-NEXT:    s_waitcnt vmcnt(0)
392; GCN-NEXT:  .LBB4_4: ; %exit
393; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
394; GCN-NEXT:    s_waitcnt vmcnt(0)
395; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
396; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
397; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, -2.0, vcc
398; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[6:7]
399; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, -2.0, vcc
400; GCN-NEXT:    v_mov_b32_e32 v0, 0
401; GCN-NEXT:    v_mov_b32_e32 v2, 0
402; GCN-NEXT:    s_setpc_b64 s[30:31]
403  br i1 %c0, label %T, label %F
404
405T:
406  %t = load volatile <8 x double>, ptr addrspace(1) %p0
407  br label %exit
408
409F:
410  %f = load volatile <8 x double>, ptr addrspace(1) %p1
411  br label %exit
412
413exit:
414  %m = phi <8 x double> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-1 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
415  %v2 = shufflevector <8 x double> %m, <8 x double> poison, <2 x i32> <i32 0, i32 1>
416  %b2 = fcmp ogt <2 x double> %v2, <double -1.0, double -1.0>
417  %r2 = select <2 x i1> %b2, <2 x double> <double -2.0, double -2.0>, <2 x double> <double -1.0, double -1.0>
418  ret <2 x double> %r2
419}
420
; Extracts the low four lanes of an <8 x double> value that reaches %exit
; through a phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen
; by %c0). Each extracted lane yields -2.0 when it compares ordered-greater-than
; -1.0 and -1.0 otherwise. The assertions below still expect four dwordx4 loads
; (offsets 0-48) in each branch, i.e. the whole source vector.
421define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
422; GCN-LABEL: extract_4xf64:
423; GCN:       ; %bb.0:
424; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
426; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
427; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
428; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
429; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
430; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
431; GCN-NEXT:    s_cbranch_execz .LBB5_2
432; GCN-NEXT:  ; %bb.1: ; %F
433; GCN-NEXT:    s_mov_b32 s10, 0
434; GCN-NEXT:    s_mov_b32 s11, 0xf000
435; GCN-NEXT:    s_mov_b32 s8, s10
436; GCN-NEXT:    s_mov_b32 s9, s10
437; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
438; GCN-NEXT:    s_waitcnt vmcnt(0)
439; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
440; GCN-NEXT:    s_waitcnt vmcnt(0)
441; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
442; GCN-NEXT:    s_waitcnt vmcnt(0)
443; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
444; GCN-NEXT:    s_waitcnt vmcnt(0)
445; GCN-NEXT:    ; implicit-def: $vgpr0
446; GCN-NEXT:  .LBB5_2: ; %Flow
447; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
448; GCN-NEXT:    s_cbranch_execz .LBB5_4
449; GCN-NEXT:  ; %bb.3: ; %T
450; GCN-NEXT:    s_mov_b32 s10, 0
451; GCN-NEXT:    s_mov_b32 s11, 0xf000
452; GCN-NEXT:    s_mov_b32 s8, s10
453; GCN-NEXT:    s_mov_b32 s9, s10
454; GCN-NEXT:    s_waitcnt vmcnt(0)
455; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
456; GCN-NEXT:    s_waitcnt vmcnt(0)
457; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
458; GCN-NEXT:    s_waitcnt vmcnt(0)
459; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
460; GCN-NEXT:    s_waitcnt vmcnt(0)
461; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
462; GCN-NEXT:    s_waitcnt vmcnt(0)
463; GCN-NEXT:  .LBB5_4: ; %exit
464; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
465; GCN-NEXT:    s_waitcnt vmcnt(0)
466; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
467; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
468; GCN-NEXT:    v_cndmask_b32_e32 v1, -2.0, v0, vcc
469; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
470; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
471; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9]
472; GCN-NEXT:    v_cndmask_b32_e32 v5, -2.0, v0, vcc
473; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11]
474; GCN-NEXT:    v_cndmask_b32_e32 v7, -2.0, v0, vcc
475; GCN-NEXT:    v_mov_b32_e32 v0, 0
476; GCN-NEXT:    v_mov_b32_e32 v2, 0
477; GCN-NEXT:    v_mov_b32_e32 v4, 0
478; GCN-NEXT:    v_mov_b32_e32 v6, 0
479; GCN-NEXT:    s_setpc_b64 s[30:31]
480  br i1 %c0, label %T, label %F
481
482T:
483  %t = load volatile <8 x double>, ptr addrspace(1) %p0
484  br label %exit
485
486F:
487  %f = load volatile <8 x double>, ptr addrspace(1) %p1
488  br label %exit
489
490exit:
491  %m = phi <8 x double> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-3 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
492  %v2 = shufflevector <8 x double> %m, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
493  %b2 = fcmp ogt <4 x double> %v2, <double -1.0, double -1.0, double -1.0, double -1.0>
494  %r2 = select <4 x i1> %b2, <4 x double> <double -2.0, double -2.0, double -2.0, double -2.0>, <4 x double> <double -1.0, double -1.0, double -1.0, double -1.0>
495  ret <4 x double> %r2
496}
497
; Extracts the low eight lanes of a <16 x double> value that reaches %exit
; through a phi of two volatile loads (%p0 in block %T, %p1 in block %F, chosen
; by %c0). Each extracted lane yields -2.0 when it compares ordered-greater-than
; -1.0 and -1.0 otherwise. The assertions below still expect eight dwordx4
; loads (offsets 0-112) in each branch, i.e. the whole source vector.
498define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
499; GCN-LABEL: extract_8xf64:
500; GCN:       ; %bb.0:
501; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
503; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
504; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
505; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
506; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
507; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
508; GCN-NEXT:    s_cbranch_execz .LBB6_2
509; GCN-NEXT:  ; %bb.1: ; %F
510; GCN-NEXT:    s_mov_b32 s10, 0
511; GCN-NEXT:    s_mov_b32 s11, 0xf000
512; GCN-NEXT:    s_mov_b32 s8, s10
513; GCN-NEXT:    s_mov_b32 s9, s10
514; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
515; GCN-NEXT:    s_waitcnt vmcnt(0)
516; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
517; GCN-NEXT:    s_waitcnt vmcnt(0)
518; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
519; GCN-NEXT:    s_waitcnt vmcnt(0)
520; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
521; GCN-NEXT:    s_waitcnt vmcnt(0)
522; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
523; GCN-NEXT:    s_waitcnt vmcnt(0)
524; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
525; GCN-NEXT:    s_waitcnt vmcnt(0)
526; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
527; GCN-NEXT:    s_waitcnt vmcnt(0)
528; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
529; GCN-NEXT:    s_waitcnt vmcnt(0)
530; GCN-NEXT:    ; implicit-def: $vgpr0
531; GCN-NEXT:  .LBB6_2: ; %Flow
532; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
533; GCN-NEXT:    s_cbranch_execz .LBB6_4
534; GCN-NEXT:  ; %bb.3: ; %T
535; GCN-NEXT:    s_mov_b32 s10, 0
536; GCN-NEXT:    s_mov_b32 s11, 0xf000
537; GCN-NEXT:    s_mov_b32 s8, s10
538; GCN-NEXT:    s_mov_b32 s9, s10
539; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
540; GCN-NEXT:    s_waitcnt vmcnt(0)
541; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
542; GCN-NEXT:    s_waitcnt vmcnt(0)
543; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
544; GCN-NEXT:    s_waitcnt vmcnt(0)
545; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
546; GCN-NEXT:    s_waitcnt vmcnt(0)
547; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
548; GCN-NEXT:    s_waitcnt vmcnt(0)
549; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
550; GCN-NEXT:    s_waitcnt vmcnt(0)
551; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
552; GCN-NEXT:    s_waitcnt vmcnt(0)
553; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
554; GCN-NEXT:    s_waitcnt vmcnt(0)
555; GCN-NEXT:  .LBB6_4: ; %exit
556; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
557; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
558; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
559; GCN-NEXT:    v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9]
560; GCN-NEXT:    v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
561; GCN-NEXT:    v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
562; GCN-NEXT:    v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
563; GCN-NEXT:    s_waitcnt vmcnt(0)
564; GCN-NEXT:    v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
565; GCN-NEXT:    v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
566; GCN-NEXT:    v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
567; GCN-NEXT:    v_cndmask_b32_e64 v1, -2.0, v0, s[16:17]
568; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
569; GCN-NEXT:    v_cndmask_b32_e64 v5, -2.0, v0, s[4:5]
570; GCN-NEXT:    v_cndmask_b32_e64 v7, -2.0, v0, s[6:7]
571; GCN-NEXT:    v_cndmask_b32_e64 v9, -2.0, v0, s[8:9]
572; GCN-NEXT:    v_cndmask_b32_e64 v11, -2.0, v0, s[10:11]
573; GCN-NEXT:    v_cndmask_b32_e64 v13, -2.0, v0, s[12:13]
574; GCN-NEXT:    v_cndmask_b32_e64 v15, -2.0, v0, s[14:15]
575; GCN-NEXT:    v_mov_b32_e32 v0, 0
576; GCN-NEXT:    v_mov_b32_e32 v2, 0
577; GCN-NEXT:    v_mov_b32_e32 v4, 0
578; GCN-NEXT:    v_mov_b32_e32 v6, 0
579; GCN-NEXT:    v_mov_b32_e32 v8, 0
580; GCN-NEXT:    v_mov_b32_e32 v10, 0
581; GCN-NEXT:    v_mov_b32_e32 v12, 0
582; GCN-NEXT:    v_mov_b32_e32 v14, 0
583; GCN-NEXT:    s_setpc_b64 s[30:31]
584  br i1 %c0, label %T, label %F
585
586T:
587  %t = load volatile <16 x double>, ptr addrspace(1) %p0
588  br label %exit
589
590F:
591  %f = load volatile <16 x double>, ptr addrspace(1) %p1
592  br label %exit
593
594exit:
595  %m = phi <16 x double> [ %t, %T ], [ %f, %F ]
  ; The mask only selects lanes 0-7 of %m, so the second operand is never read;
  ; it is spelled poison rather than the deprecated undef placeholder.
596  %v2 = shufflevector <16 x double> %m, <16 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
597  %b2 = fcmp ogt <8 x double> %v2, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
598  %r2 = select <8 x i1> %b2, <8 x double> <double -2.0, double -2.0, double -2.0, double -2.0, double -2.0, double -2.0, double -2.0, double -2.0>, <8 x double> <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
599  ret <8 x double> %r2
600}
601