xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll (revision fd3eaf76ba3392a4406247d996e757ef49f7a8b2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
5; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7
; Dynamic extractelement from a <4 x i128> loaded through an inreg
; addrspace(4) pointer, with an inreg index.
; Every target's checks mask the index with 3, shift it left by 4 to form a
; byte offset, and fetch only the selected element with one 128-bit scalar
; load (s_load_dwordx4 / s_load_b128) into s[0:3].
8define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
9; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_and_b32 s0, s4, 3
12; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
13; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    ; return to shader part epilog
16;
17; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
18; GFX8:       ; %bb.0:
19; GFX8-NEXT:    s_and_b32 s0, s4, 3
20; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
21; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], s0
22; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
23; GFX8-NEXT:    ; return to shader part epilog
24;
25; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
26; GFX7:       ; %bb.0:
27; GFX7-NEXT:    s_and_b32 s0, s4, 3
28; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
29; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], s0
30; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX7-NEXT:    ; return to shader part epilog
32;
33; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
34; GFX10:       ; %bb.0:
35; GFX10-NEXT:    s_and_b32 s0, s4, 3
36; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
37; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], s0 offset:0x0
38; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX10-NEXT:    ; return to shader part epilog
40;
41; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
42; GFX11:       ; %bb.0:
43; GFX11-NEXT:    s_and_b32 s0, s4, 3
44; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
45; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
46; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], s0 offset:0x0
47; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX11-NEXT:    ; return to shader part epilog
49  %vector = load <4 x i128>, ptr addrspace(4) %ptr
50  %element = extractelement <4 x i128> %vector, i32 %idx
51  ret i128 %element
52}
53
; Dynamic extractelement from a <4 x i128> loaded through a non-inreg
; addrspace(1) pointer (v[0:1] in the checks), with an inreg index.
; The checks compute (idx & 3) << 4 with scalar instructions and issue one
; 128-bit vector-memory load. Most targets add the offset to the pointer in
; VGPRs; the GFX7 checks instead place it in s[0:1] of the s[0:3] operand of
; buffer_load_dwordx4 ... addr64. The result is copied into s0-s3 with
; v_readfirstlane_b32 before the shader returns.
54define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
55; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
56; GFX9:       ; %bb.0:
57; GFX9-NEXT:    s_and_b32 s0, s2, 3
58; GFX9-NEXT:    s_mov_b32 s1, 0
59; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
60; GFX9-NEXT:    v_mov_b32_e32 v3, s1
61; GFX9-NEXT:    v_mov_b32_e32 v2, s0
62; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
63; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
64; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
65; GFX9-NEXT:    s_waitcnt vmcnt(0)
66; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
67; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
68; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
69; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
70; GFX9-NEXT:    ; return to shader part epilog
71;
72; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
73; GFX8:       ; %bb.0:
74; GFX8-NEXT:    s_and_b32 s0, s2, 3
75; GFX8-NEXT:    s_mov_b32 s1, 0
76; GFX8-NEXT:    s_lshl_b32 s0, s0, 4
77; GFX8-NEXT:    v_mov_b32_e32 v3, s1
78; GFX8-NEXT:    v_mov_b32_e32 v2, s0
79; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
80; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
81; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
82; GFX8-NEXT:    s_waitcnt vmcnt(0)
83; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
84; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
85; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
86; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
87; GFX8-NEXT:    ; return to shader part epilog
88;
89; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
90; GFX7:       ; %bb.0:
91; GFX7-NEXT:    s_and_b32 s0, s2, 3
92; GFX7-NEXT:    s_mov_b32 s1, 0
93; GFX7-NEXT:    s_lshl_b32 s0, s0, 4
94; GFX7-NEXT:    s_mov_b32 s3, 0xf000
95; GFX7-NEXT:    s_mov_b32 s2, s1
96; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
97; GFX7-NEXT:    s_waitcnt vmcnt(0)
98; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
99; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
100; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
101; GFX7-NEXT:    v_readfirstlane_b32 s3, v3
102; GFX7-NEXT:    ; return to shader part epilog
103;
104; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
105; GFX10:       ; %bb.0:
106; GFX10-NEXT:    s_and_b32 s0, s2, 3
107; GFX10-NEXT:    s_mov_b32 s1, 0
108; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
109; GFX10-NEXT:    v_mov_b32_e32 v3, s1
110; GFX10-NEXT:    v_mov_b32_e32 v2, s0
111; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
112; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
113; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
114; GFX10-NEXT:    s_waitcnt vmcnt(0)
115; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
116; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
117; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
118; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
119; GFX10-NEXT:    ; return to shader part epilog
120;
121; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
122; GFX11:       ; %bb.0:
123; GFX11-NEXT:    s_and_b32 s0, s2, 3
124; GFX11-NEXT:    s_mov_b32 s1, 0
125; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
126; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
127; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
128; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
130; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
131; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
132; GFX11-NEXT:    s_waitcnt vmcnt(0)
133; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
134; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
135; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
136; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
137; GFX11-NEXT:    ; return to shader part epilog
138  %vector = load <4 x i128>, ptr addrspace(1) %ptr
139  %element = extractelement <4 x i128> %vector, i32 %idx
140  ret i128 %element
141}
142
; Dynamic extractelement from a <4 x i128> in a function with the default
; calling convention: the addrspace(1) pointer and the index are both
; non-inreg arguments (v[0:1] and v2 in the checks).
; The checks compute (idx & 3) << 4 with VALU instructions, add it to the
; low half of the pointer with the carry propagated into the high half, load
; 128 bits into v[0:3], and return with s_setpc_b64 s[30:31].
143define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
144; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
145; GFX9:       ; %bb.0:
146; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
148; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
149; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
150; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
151; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
152; GFX9-NEXT:    s_waitcnt vmcnt(0)
153; GFX9-NEXT:    s_setpc_b64 s[30:31]
154;
155; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
156; GFX8:       ; %bb.0:
157; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
159; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
160; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
161; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
162; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
163; GFX8-NEXT:    s_waitcnt vmcnt(0)
164; GFX8-NEXT:    s_setpc_b64 s[30:31]
165;
166; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
167; GFX7:       ; %bb.0:
168; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169; GFX7-NEXT:    v_and_b32_e32 v2, 3, v2
170; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
171; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
172; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
173; GFX7-NEXT:    s_mov_b32 s6, 0
174; GFX7-NEXT:    s_mov_b32 s7, 0xf000
175; GFX7-NEXT:    s_mov_b64 s[4:5], 0
176; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
177; GFX7-NEXT:    s_waitcnt vmcnt(0)
178; GFX7-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
181; GFX10:       ; %bb.0:
182; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
184; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
185; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
186; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
187; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
188; GFX10-NEXT:    s_waitcnt vmcnt(0)
189; GFX10-NEXT:    s_setpc_b64 s[30:31]
190;
191; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
192; GFX11:       ; %bb.0:
193; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
195; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
196; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
197; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
198; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
199; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
200; GFX11-NEXT:    s_waitcnt vmcnt(0)
201; GFX11-NEXT:    s_setpc_b64 s[30:31]
202  %vector = load <4 x i128>, ptr addrspace(1) %ptr
203  %element = extractelement <4 x i128> %vector, i32 %idx
204  ret i128 %element
205}
206
; Dynamic extractelement from a <4 x i128> loaded through an inreg
; addrspace(4) pointer (s[2:3] in the checks), with a non-inreg index in v0.
; The checks compute (idx & 3) << 4 with VALU instructions and use a
; vector-memory load instead of a scalar load. GFX9, GFX10 and GFX11 pass v0
; and s[2:3] directly to the global load; the GFX8 checks copy s[2:3] into
; VGPRs and do a 64-bit add; the GFX7 checks move s[2:3] into s[0:1] of the
; buffer_load_dwordx4 operand. The result is copied into s0-s3 with
; v_readfirstlane_b32 before the shader returns.
207define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
208; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
209; GFX9:       ; %bb.0:
210; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
211; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
212; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
213; GFX9-NEXT:    s_waitcnt vmcnt(0)
214; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
215; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
216; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
217; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
218; GFX9-NEXT:    ; return to shader part epilog
219;
220; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
221; GFX8:       ; %bb.0:
222; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
223; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
224; GFX8-NEXT:    v_mov_b32_e32 v0, s2
225; GFX8-NEXT:    v_mov_b32_e32 v1, s3
226; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
227; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
228; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
229; GFX8-NEXT:    s_waitcnt vmcnt(0)
230; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
231; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
232; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
233; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
234; GFX8-NEXT:    ; return to shader part epilog
235;
236; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
237; GFX7:       ; %bb.0:
238; GFX7-NEXT:    v_and_b32_e32 v0, 3, v0
239; GFX7-NEXT:    s_mov_b32 s0, s2
240; GFX7-NEXT:    s_mov_b32 s1, s3
241; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
242; GFX7-NEXT:    v_mov_b32_e32 v1, 0
243; GFX7-NEXT:    s_mov_b32 s2, 0
244; GFX7-NEXT:    s_mov_b32 s3, 0xf000
245; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
246; GFX7-NEXT:    s_waitcnt vmcnt(0)
247; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
248; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
249; GFX7-NEXT:    v_readfirstlane_b32 s2, v2
250; GFX7-NEXT:    v_readfirstlane_b32 s3, v3
251; GFX7-NEXT:    ; return to shader part epilog
252;
253; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
254; GFX10:       ; %bb.0:
255; GFX10-NEXT:    v_and_b32_e32 v0, 3, v0
256; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
257; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
258; GFX10-NEXT:    s_waitcnt vmcnt(0)
259; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
260; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
261; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
262; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
263; GFX10-NEXT:    ; return to shader part epilog
264;
265; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
266; GFX11:       ; %bb.0:
267; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
268; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
269; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
270; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
271; GFX11-NEXT:    s_waitcnt vmcnt(0)
272; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
273; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
274; GFX11-NEXT:    v_readfirstlane_b32 s2, v2
275; GFX11-NEXT:    v_readfirstlane_b32 s3, v3
276; GFX11-NEXT:    ; return to shader part epilog
277  %vector = load <4 x i128>, ptr addrspace(4) %ptr
278  %element = extractelement <4 x i128> %vector, i32 %idx
279  ret i128 %element
280}
281
; Constant index 0 from a <4 x i128> loaded through an inreg addrspace(4)
; pointer.
; The checks expect one 128-bit scalar load at immediate offset 0x0 and no
; index arithmetic. The gfx900, fiji and hawaii invocations share the common
; GCN prefix for this function; GFX10 and GFX11 have their own check blocks.
282define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
283; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
284; GCN:       ; %bb.0:
285; GCN-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
286; GCN-NEXT:    s_waitcnt lgkmcnt(0)
287; GCN-NEXT:    ; return to shader part epilog
288;
289; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
290; GFX10:       ; %bb.0:
291; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
292; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX10-NEXT:    ; return to shader part epilog
294;
295; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
296; GFX11:       ; %bb.0:
297; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x0
298; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX11-NEXT:    ; return to shader part epilog
300  %vector = load <4 x i128>, ptr addrspace(4) %ptr
301  %element = extractelement <4 x i128> %vector, i32 0
302  ret i128 %element
303}
304
; Constant index 1 from a <4 x i128> loaded through an inreg addrspace(4)
; pointer.
; The checks expect one 128-bit scalar load with the element offset folded
; into the immediate and no index arithmetic. The immediate is 0x10 on every
; target except in the GFX7 checks, which show 0x4.
; NOTE(review): presumably the GFX7 immediate is counted in dwords rather
; than bytes - confirm against the ISA documentation.
305define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
306; GFX9-LABEL: extractelement_sgpr_v4i128_idx1:
307; GFX9:       ; %bb.0:
308; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x10
309; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX9-NEXT:    ; return to shader part epilog
311;
312; GFX8-LABEL: extractelement_sgpr_v4i128_idx1:
313; GFX8:       ; %bb.0:
314; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x10
315; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
316; GFX8-NEXT:    ; return to shader part epilog
317;
318; GFX7-LABEL: extractelement_sgpr_v4i128_idx1:
319; GFX7:       ; %bb.0:
320; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x4
321; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX7-NEXT:    ; return to shader part epilog
323;
324; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
325; GFX10:       ; %bb.0:
326; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x10
327; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX10-NEXT:    ; return to shader part epilog
329;
330; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
331; GFX11:       ; %bb.0:
332; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x10
333; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX11-NEXT:    ; return to shader part epilog
335  %vector = load <4 x i128>, ptr addrspace(4) %ptr
336  %element = extractelement <4 x i128> %vector, i32 1
337  ret i128 %element
338}
339
; Constant index 2 from a <4 x i128> loaded through an inreg addrspace(4)
; pointer.
; The checks expect one 128-bit scalar load with the element offset folded
; into the immediate and no index arithmetic. The immediate is 0x20 on every
; target except in the GFX7 checks, which show 0x8.
; NOTE(review): presumably the GFX7 immediate is counted in dwords rather
; than bytes - confirm against the ISA documentation.
340define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
341; GFX9-LABEL: extractelement_sgpr_v4i128_idx2:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x20
344; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
345; GFX9-NEXT:    ; return to shader part epilog
346;
347; GFX8-LABEL: extractelement_sgpr_v4i128_idx2:
348; GFX8:       ; %bb.0:
349; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x20
350; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX8-NEXT:    ; return to shader part epilog
352;
353; GFX7-LABEL: extractelement_sgpr_v4i128_idx2:
354; GFX7:       ; %bb.0:
355; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x8
356; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX7-NEXT:    ; return to shader part epilog
358;
359; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
360; GFX10:       ; %bb.0:
361; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x20
362; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX10-NEXT:    ; return to shader part epilog
364;
365; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
366; GFX11:       ; %bb.0:
367; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x20
368; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX11-NEXT:    ; return to shader part epilog
370  %vector = load <4 x i128>, ptr addrspace(4) %ptr
371  %element = extractelement <4 x i128> %vector, i32 2
372  ret i128 %element
373}
374
; Constant index 3 (the last element) from a <4 x i128> loaded through an
; inreg addrspace(4) pointer.
; The checks expect one 128-bit scalar load with the element offset folded
; into the immediate and no index arithmetic. The immediate is 0x30 on every
; target except in the GFX7 checks, which show 0xc.
; NOTE(review): presumably the GFX7 immediate is counted in dwords rather
; than bytes - confirm against the ISA documentation.
375define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
376; GFX9-LABEL: extractelement_sgpr_v4i128_idx3:
377; GFX9:       ; %bb.0:
378; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x30
379; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX9-NEXT:    ; return to shader part epilog
381;
382; GFX8-LABEL: extractelement_sgpr_v4i128_idx3:
383; GFX8:       ; %bb.0:
384; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x30
385; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX8-NEXT:    ; return to shader part epilog
387;
388; GFX7-LABEL: extractelement_sgpr_v4i128_idx3:
389; GFX7:       ; %bb.0:
390; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0xc
391; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX7-NEXT:    ; return to shader part epilog
393;
394; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
395; GFX10:       ; %bb.0:
396; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x30
397; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
398; GFX10-NEXT:    ; return to shader part epilog
399;
400; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
401; GFX11:       ; %bb.0:
402; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x30
403; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
404; GFX11-NEXT:    ; return to shader part epilog
405  %vector = load <4 x i128>, ptr addrspace(4) %ptr
406  %element = extractelement <4 x i128> %vector, i32 3
407  ret i128 %element
408}
409
; Constant index 0 from a <4 x i128> loaded through a non-inreg addrspace(1)
; pointer, in a function with the default calling convention.
; The checks expect one 128-bit load from v[0:1] into v[0:3] with no offset
; and no address arithmetic. The GFX7 checks first materialize s[4:7] for
; buffer_load_dwordx4 ... addr64. Each variant returns with
; s_setpc_b64 s[30:31].
410define i128 @extractelement_vgpr_v4i128_idx0(ptr addrspace(1) %ptr) {
411; GFX9-LABEL: extractelement_vgpr_v4i128_idx0:
412; GFX9:       ; %bb.0:
413; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
415; GFX9-NEXT:    s_waitcnt vmcnt(0)
416; GFX9-NEXT:    s_setpc_b64 s[30:31]
417;
418; GFX8-LABEL: extractelement_vgpr_v4i128_idx0:
419; GFX8:       ; %bb.0:
420; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
422; GFX8-NEXT:    s_waitcnt vmcnt(0)
423; GFX8-NEXT:    s_setpc_b64 s[30:31]
424;
425; GFX7-LABEL: extractelement_vgpr_v4i128_idx0:
426; GFX7:       ; %bb.0:
427; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428; GFX7-NEXT:    s_mov_b32 s6, 0
429; GFX7-NEXT:    s_mov_b32 s7, 0xf000
430; GFX7-NEXT:    s_mov_b64 s[4:5], 0
431; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
432; GFX7-NEXT:    s_waitcnt vmcnt(0)
433; GFX7-NEXT:    s_setpc_b64 s[30:31]
434;
435; GFX10-LABEL: extractelement_vgpr_v4i128_idx0:
436; GFX10:       ; %bb.0:
437; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
438; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
439; GFX10-NEXT:    s_waitcnt vmcnt(0)
440; GFX10-NEXT:    s_setpc_b64 s[30:31]
441;
442; GFX11-LABEL: extractelement_vgpr_v4i128_idx0:
443; GFX11:       ; %bb.0:
444; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
446; GFX11-NEXT:    s_waitcnt vmcnt(0)
447; GFX11-NEXT:    s_setpc_b64 s[30:31]
448  %vector = load <4 x i128>, ptr addrspace(1) %ptr
449  %element = extractelement <4 x i128> %vector, i32 0
450  ret i128 %element
451}
452
; Constant index 1 from a <4 x i128> loaded through a non-inreg addrspace(1)
; pointer, in a function with the default calling convention.
; The checks expect one 128-bit load into v[0:3]. GFX9, GFX10, GFX11 and the
; GFX7 buffer load fold the element offset into the instruction as offset:16;
; the GFX8 checks instead add 16 to the pointer with v_add_u32/v_addc_u32
; before flat_load_dwordx4.
453define i128 @extractelement_vgpr_v4i128_idx1(ptr addrspace(1) %ptr) {
454; GFX9-LABEL: extractelement_vgpr_v4i128_idx1:
455; GFX9:       ; %bb.0:
456; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
457; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
458; GFX9-NEXT:    s_waitcnt vmcnt(0)
459; GFX9-NEXT:    s_setpc_b64 s[30:31]
460;
461; GFX8-LABEL: extractelement_vgpr_v4i128_idx1:
462; GFX8:       ; %bb.0:
463; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
464; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
465; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
466; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
467; GFX8-NEXT:    s_waitcnt vmcnt(0)
468; GFX8-NEXT:    s_setpc_b64 s[30:31]
469;
470; GFX7-LABEL: extractelement_vgpr_v4i128_idx1:
471; GFX7:       ; %bb.0:
472; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX7-NEXT:    s_mov_b32 s6, 0
474; GFX7-NEXT:    s_mov_b32 s7, 0xf000
475; GFX7-NEXT:    s_mov_b64 s[4:5], 0
476; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:16
477; GFX7-NEXT:    s_waitcnt vmcnt(0)
478; GFX7-NEXT:    s_setpc_b64 s[30:31]
479;
480; GFX10-LABEL: extractelement_vgpr_v4i128_idx1:
481; GFX10:       ; %bb.0:
482; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
484; GFX10-NEXT:    s_waitcnt vmcnt(0)
485; GFX10-NEXT:    s_setpc_b64 s[30:31]
486;
487; GFX11-LABEL: extractelement_vgpr_v4i128_idx1:
488; GFX11:       ; %bb.0:
489; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
490; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
491; GFX11-NEXT:    s_waitcnt vmcnt(0)
492; GFX11-NEXT:    s_setpc_b64 s[30:31]
493  %vector = load <4 x i128>, ptr addrspace(1) %ptr
494  %element = extractelement <4 x i128> %vector, i32 1
495  ret i128 %element
496}
497
; Constant index 2 from a <4 x i128> loaded through a non-inreg addrspace(1)
; pointer, in a function with the default calling convention.
; The checks expect one 128-bit load into v[0:3]. GFX9, GFX10, GFX11 and the
; GFX7 buffer load fold the element offset into the instruction as offset:32;
; the GFX8 checks instead add 32 to the pointer with v_add_u32/v_addc_u32
; before flat_load_dwordx4.
498define i128 @extractelement_vgpr_v4i128_idx2(ptr addrspace(1) %ptr) {
499; GFX9-LABEL: extractelement_vgpr_v4i128_idx2:
500; GFX9:       ; %bb.0:
501; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:32
503; GFX9-NEXT:    s_waitcnt vmcnt(0)
504; GFX9-NEXT:    s_setpc_b64 s[30:31]
505;
506; GFX8-LABEL: extractelement_vgpr_v4i128_idx2:
507; GFX8:       ; %bb.0:
508; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
509; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
510; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
511; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
512; GFX8-NEXT:    s_waitcnt vmcnt(0)
513; GFX8-NEXT:    s_setpc_b64 s[30:31]
514;
515; GFX7-LABEL: extractelement_vgpr_v4i128_idx2:
516; GFX7:       ; %bb.0:
517; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518; GFX7-NEXT:    s_mov_b32 s6, 0
519; GFX7-NEXT:    s_mov_b32 s7, 0xf000
520; GFX7-NEXT:    s_mov_b64 s[4:5], 0
521; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:32
522; GFX7-NEXT:    s_waitcnt vmcnt(0)
523; GFX7-NEXT:    s_setpc_b64 s[30:31]
524;
525; GFX10-LABEL: extractelement_vgpr_v4i128_idx2:
526; GFX10:       ; %bb.0:
527; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:32
529; GFX10-NEXT:    s_waitcnt vmcnt(0)
530; GFX10-NEXT:    s_setpc_b64 s[30:31]
531;
532; GFX11-LABEL: extractelement_vgpr_v4i128_idx2:
533; GFX11:       ; %bb.0:
534; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:32
536; GFX11-NEXT:    s_waitcnt vmcnt(0)
537; GFX11-NEXT:    s_setpc_b64 s[30:31]
538  %vector = load <4 x i128>, ptr addrspace(1) %ptr
539  %element = extractelement <4 x i128> %vector, i32 2
540  ret i128 %element
541}
542
; Constant index 3 (the last element) from a <4 x i128> loaded through a
; non-inreg addrspace(1) pointer, in a function with the default calling
; convention.
; The checks expect one 128-bit load into v[0:3]. GFX9, GFX10, GFX11 and the
; GFX7 buffer load fold the element offset into the instruction as offset:48;
; the GFX8 checks instead add 48 to the pointer with v_add_u32/v_addc_u32
; before flat_load_dwordx4.
543define i128 @extractelement_vgpr_v4i128_idx3(ptr addrspace(1) %ptr) {
544; GFX9-LABEL: extractelement_vgpr_v4i128_idx3:
545; GFX9:       ; %bb.0:
546; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:48
548; GFX9-NEXT:    s_waitcnt vmcnt(0)
549; GFX9-NEXT:    s_setpc_b64 s[30:31]
550;
551; GFX8-LABEL: extractelement_vgpr_v4i128_idx3:
552; GFX8:       ; %bb.0:
553; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
554; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
555; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
556; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
557; GFX8-NEXT:    s_waitcnt vmcnt(0)
558; GFX8-NEXT:    s_setpc_b64 s[30:31]
559;
560; GFX7-LABEL: extractelement_vgpr_v4i128_idx3:
561; GFX7:       ; %bb.0:
562; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563; GFX7-NEXT:    s_mov_b32 s6, 0
564; GFX7-NEXT:    s_mov_b32 s7, 0xf000
565; GFX7-NEXT:    s_mov_b64 s[4:5], 0
566; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:48
567; GFX7-NEXT:    s_waitcnt vmcnt(0)
568; GFX7-NEXT:    s_setpc_b64 s[30:31]
569;
570; GFX10-LABEL: extractelement_vgpr_v4i128_idx3:
571; GFX10:       ; %bb.0:
572; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:48
574; GFX10-NEXT:    s_waitcnt vmcnt(0)
575; GFX10-NEXT:    s_setpc_b64 s[30:31]
576;
577; GFX11-LABEL: extractelement_vgpr_v4i128_idx3:
578; GFX11:       ; %bb.0:
579; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:48
581; GFX11-NEXT:    s_waitcnt vmcnt(0)
582; GFX11-NEXT:    s_setpc_b64 s[30:31]
583  %vector = load <4 x i128>, ptr addrspace(1) %ptr
584  %element = extractelement <4 x i128> %vector, i32 3
585  ret i128 %element
586}
587