xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll (revision bd679865c05b29450428ad460e59e2dcd07fe974)
1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
4
; Loads a <2 x i8> from the global pointer %arg0 and returns a vector whose
; two lanes are both element 1 of the loaded value (shuffle mask <1, 1>).
; The assertions for all three targets expect one 16-bit load, a right shift
; by 8 of v0, and a copy of v0 into v1 before returning. The gfx1100
; assertions additionally expect an s_delay_alu between the shift and the copy.
5define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
6; GFX9-LABEL: shuffle_v2i8_rebroadcast:
7; GFX9:       ; %bb.0: ; %entry
8; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; GFX9-NEXT:  global_load_ushort v0, v[0:1], off
10; GFX9-NEXT:  s_waitcnt vmcnt(0)
11; GFX9-NEXT:  v_lshrrev_b16_e32 v0, 8, v0
12; GFX9-NEXT:  v_mov_b32_e32 v1, v0
13; GFX9-NEXT:  s_setpc_b64 s[30:31]
14;
15; GFX10-LABEL: shuffle_v2i8_rebroadcast:
16; GFX10:       ; %bb.0: ; %entry
17; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX10-NEXT:  global_load_ushort v0, v[0:1], off
19; GFX10-NEXT:  s_waitcnt vmcnt(0)
20; GFX10-NEXT:  v_lshrrev_b16 v0, 8, v0
21; GFX10-NEXT:  v_mov_b32_e32 v1, v0
22; GFX10-NEXT:  s_setpc_b64 s[30:31]
23;
24; GFX11-LABEL: shuffle_v2i8_rebroadcast:
25; GFX11:       ; %bb.0: ; %entry
26; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX11-NEXT:  global_load_u16 v0, v[0:1], off
28; GFX11-NEXT:  s_waitcnt vmcnt(0)
29; GFX11-NEXT:  v_lshrrev_b16 v0, 8, v0
30; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
31; GFX11-NEXT:  v_mov_b32_e32 v1, v0
32; GFX11-NEXT:  s_setpc_b64 s[30:31]
33entry:
34  %val0 = load <2 x i8>, ptr addrspace(1) %arg0
35  %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1>
36  ret <2 x i8> %val1
37}
38
; Loads a <4 x i8> from the global pointer %arg0 and returns a vector whose
; four lanes are all element 1 of the loaded value (all-ones shuffle mask).
; The assertions for all three targets expect one dword load, a 32-bit right
; shift by 8 of v0, and copies of v0 into v1..v3 before returning. The gfx1100
; assertions additionally expect an s_delay_alu after the shift.
39define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) {
40; GFX9-LABEL: shuffle_v4i8_rebroadcast:
41; GFX9:       ; %bb.0: ; %entry
42; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43; GFX9-NEXT:  global_load_dword v0, v[0:1], off
44; GFX9-NEXT:  s_waitcnt vmcnt(0)
45; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
46; GFX9-NEXT:  v_mov_b32_e32 v1, v0
47; GFX9-NEXT:  v_mov_b32_e32 v2, v0
48; GFX9-NEXT:  v_mov_b32_e32 v3, v0
49; GFX9-NEXT:  s_setpc_b64 s[30:31]
50;
51; GFX10-LABEL: shuffle_v4i8_rebroadcast:
52; GFX10:       ; %bb.0: ; %entry
53; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54; GFX10-NEXT:  global_load_dword v0, v[0:1], off
55; GFX10-NEXT:  s_waitcnt vmcnt(0)
56; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
57; GFX10-NEXT:  v_mov_b32_e32 v1, v0
58; GFX10-NEXT:  v_mov_b32_e32 v2, v0
59; GFX10-NEXT:  v_mov_b32_e32 v3, v0
60; GFX10-NEXT:  s_setpc_b64 s[30:31]
61;
62; GFX11-LABEL: shuffle_v4i8_rebroadcast:
63; GFX11:       ; %bb.0: ; %entry
64; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
66; GFX11-NEXT:  s_waitcnt vmcnt(0)
67; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
68; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
69; GFX11-NEXT:  v_mov_b32_e32 v1, v0
70; GFX11-NEXT:  v_mov_b32_e32 v2, v0
71; GFX11-NEXT:  v_mov_b32_e32 v3, v0
72; GFX11-NEXT:  s_setpc_b64 s[30:31]
73entry:
74  %val0 = load <4 x i8>, ptr addrspace(1) %arg0
75  %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
76  ret <4 x i8> %val1
77}
78
; Loads an <8 x i8> from the global pointer %arg0 and returns a vector whose
; eight lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads eight bytes, the assertions for all three targets
; expect only a single dword load, followed by a 32-bit right shift by 8 of
; v0 and copies of v0 into v1..v7. The gfx1100 assertions additionally expect
; an s_delay_alu after the shift.
79define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) {
80; GFX9-LABEL: shuffle_v8i8_rebroadcast:
81; GFX9:       ; %bb.0: ; %entry
82; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX9-NEXT:  global_load_dword v0, v[0:1], off
84; GFX9-NEXT:  s_waitcnt vmcnt(0)
85; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
86; GFX9-NEXT:  v_mov_b32_e32 v1, v0
87; GFX9-NEXT:  v_mov_b32_e32 v2, v0
88; GFX9-NEXT:  v_mov_b32_e32 v3, v0
89; GFX9-NEXT:  v_mov_b32_e32 v4, v0
90; GFX9-NEXT:  v_mov_b32_e32 v5, v0
91; GFX9-NEXT:  v_mov_b32_e32 v6, v0
92; GFX9-NEXT:  v_mov_b32_e32 v7, v0
93; GFX9-NEXT:  s_setpc_b64 s[30:31]
94;
95; GFX10-LABEL: shuffle_v8i8_rebroadcast:
96; GFX10:       ; %bb.0: ; %entry
97; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX10-NEXT:  global_load_dword v0, v[0:1], off
99; GFX10-NEXT:  s_waitcnt vmcnt(0)
100; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
101; GFX10-NEXT:  v_mov_b32_e32 v1, v0
102; GFX10-NEXT:  v_mov_b32_e32 v2, v0
103; GFX10-NEXT:  v_mov_b32_e32 v3, v0
104; GFX10-NEXT:  v_mov_b32_e32 v4, v0
105; GFX10-NEXT:  v_mov_b32_e32 v5, v0
106; GFX10-NEXT:  v_mov_b32_e32 v6, v0
107; GFX10-NEXT:  v_mov_b32_e32 v7, v0
108; GFX10-NEXT:  s_setpc_b64 s[30:31]
109;
110; GFX11-LABEL: shuffle_v8i8_rebroadcast:
111; GFX11:       ; %bb.0: ; %entry
112; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
114; GFX11-NEXT:  s_waitcnt vmcnt(0)
115; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
116; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
117; GFX11-NEXT:  v_mov_b32_e32 v1, v0
118; GFX11-NEXT:  v_mov_b32_e32 v2, v0
119; GFX11-NEXT:  v_mov_b32_e32 v3, v0
120; GFX11-NEXT:  v_mov_b32_e32 v4, v0
121; GFX11-NEXT:  v_mov_b32_e32 v5, v0
122; GFX11-NEXT:  v_mov_b32_e32 v6, v0
123; GFX11-NEXT:  v_mov_b32_e32 v7, v0
124; GFX11-NEXT:  s_setpc_b64 s[30:31]
125entry:
126  %val0 = load <8 x i8>, ptr addrspace(1) %arg0
127  %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
128  ret <8 x i8> %val1
129}
130
; Loads a <16 x i8> from the global pointer %arg0 and returns a vector whose
; sixteen lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads sixteen bytes, the assertions for all three targets
; expect only a single dword load, followed by a 32-bit right shift by 8 of
; v0 and copies of v0 into v1..v15. The gfx1100 assertions additionally
; expect an s_delay_alu after the shift.
131define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) {
132; GFX9-LABEL: shuffle_v16i8_rebroadcast:
133; GFX9:       ; %bb.0: ; %entry
134; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; GFX9-NEXT:  global_load_dword v0, v[0:1], off
136; GFX9-NEXT:  s_waitcnt vmcnt(0)
137; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
138; GFX9-NEXT:  v_mov_b32_e32 v1, v0
139; GFX9-NEXT:  v_mov_b32_e32 v2, v0
140; GFX9-NEXT:  v_mov_b32_e32 v3, v0
141; GFX9-NEXT:  v_mov_b32_e32 v4, v0
142; GFX9-NEXT:  v_mov_b32_e32 v5, v0
143; GFX9-NEXT:  v_mov_b32_e32 v6, v0
144; GFX9-NEXT:  v_mov_b32_e32 v7, v0
145; GFX9-NEXT:  v_mov_b32_e32 v8, v0
146; GFX9-NEXT:  v_mov_b32_e32 v9, v0
147; GFX9-NEXT:  v_mov_b32_e32 v10, v0
148; GFX9-NEXT:  v_mov_b32_e32 v11, v0
149; GFX9-NEXT:  v_mov_b32_e32 v12, v0
150; GFX9-NEXT:  v_mov_b32_e32 v13, v0
151; GFX9-NEXT:  v_mov_b32_e32 v14, v0
152; GFX9-NEXT:  v_mov_b32_e32 v15, v0
153; GFX9-NEXT:  s_setpc_b64 s[30:31]
154;
155; GFX10-LABEL: shuffle_v16i8_rebroadcast:
156; GFX10:       ; %bb.0: ; %entry
157; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX10-NEXT:  global_load_dword v0, v[0:1], off
159; GFX10-NEXT:  s_waitcnt vmcnt(0)
160; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
161; GFX10-NEXT:  v_mov_b32_e32 v1, v0
162; GFX10-NEXT:  v_mov_b32_e32 v2, v0
163; GFX10-NEXT:  v_mov_b32_e32 v3, v0
164; GFX10-NEXT:  v_mov_b32_e32 v4, v0
165; GFX10-NEXT:  v_mov_b32_e32 v5, v0
166; GFX10-NEXT:  v_mov_b32_e32 v6, v0
167; GFX10-NEXT:  v_mov_b32_e32 v7, v0
168; GFX10-NEXT:  v_mov_b32_e32 v8, v0
169; GFX10-NEXT:  v_mov_b32_e32 v9, v0
170; GFX10-NEXT:  v_mov_b32_e32 v10, v0
171; GFX10-NEXT:  v_mov_b32_e32 v11, v0
172; GFX10-NEXT:  v_mov_b32_e32 v12, v0
173; GFX10-NEXT:  v_mov_b32_e32 v13, v0
174; GFX10-NEXT:  v_mov_b32_e32 v14, v0
175; GFX10-NEXT:  v_mov_b32_e32 v15, v0
176; GFX10-NEXT:  s_setpc_b64 s[30:31]
177;
178; GFX11-LABEL: shuffle_v16i8_rebroadcast:
179; GFX11:       ; %bb.0: ; %entry
180; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
182; GFX11-NEXT:  s_waitcnt vmcnt(0)
183; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
184; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
185; GFX11-NEXT:  v_mov_b32_e32 v1, v0
186; GFX11-NEXT:  v_mov_b32_e32 v2, v0
187; GFX11-NEXT:  v_mov_b32_e32 v3, v0
188; GFX11-NEXT:  v_mov_b32_e32 v4, v0
189; GFX11-NEXT:  v_mov_b32_e32 v5, v0
190; GFX11-NEXT:  v_mov_b32_e32 v6, v0
191; GFX11-NEXT:  v_mov_b32_e32 v7, v0
192; GFX11-NEXT:  v_mov_b32_e32 v8, v0
193; GFX11-NEXT:  v_mov_b32_e32 v9, v0
194; GFX11-NEXT:  v_mov_b32_e32 v10, v0
195; GFX11-NEXT:  v_mov_b32_e32 v11, v0
196; GFX11-NEXT:  v_mov_b32_e32 v12, v0
197; GFX11-NEXT:  v_mov_b32_e32 v13, v0
198; GFX11-NEXT:  v_mov_b32_e32 v14, v0
199; GFX11-NEXT:  v_mov_b32_e32 v15, v0
200; GFX11-NEXT:  s_setpc_b64 s[30:31]
201entry:
202  %val0 = load <16 x i8>, ptr addrspace(1) %arg0
203  %val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
204  ret <16 x i8> %val1
205}
206
; Loads a <32 x i8> from the global pointer %arg0 and returns a vector whose
; thirty-two lanes are all element 1 of the loaded value (all-ones shuffle
; mask). Although the IR loads thirty-two bytes, the assertions for all three
; targets expect only a single dword load, followed by a 32-bit right shift
; by 8 of v0 and copies of v0 into v1..v31. The gfx1100 assertions
; additionally expect an s_delay_alu after the shift.
207define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) {
208; GFX9-LABEL: shuffle_v32i8_rebroadcast:
209; GFX9:       ; %bb.0: ; %entry
210; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; GFX9-NEXT:  global_load_dword v0, v[0:1], off
212; GFX9-NEXT:  s_waitcnt vmcnt(0)
213; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
214; GFX9-NEXT:  v_mov_b32_e32 v1, v0
215; GFX9-NEXT:  v_mov_b32_e32 v2, v0
216; GFX9-NEXT:  v_mov_b32_e32 v3, v0
217; GFX9-NEXT:  v_mov_b32_e32 v4, v0
218; GFX9-NEXT:  v_mov_b32_e32 v5, v0
219; GFX9-NEXT:  v_mov_b32_e32 v6, v0
220; GFX9-NEXT:  v_mov_b32_e32 v7, v0
221; GFX9-NEXT:  v_mov_b32_e32 v8, v0
222; GFX9-NEXT:  v_mov_b32_e32 v9, v0
223; GFX9-NEXT:  v_mov_b32_e32 v10, v0
224; GFX9-NEXT:  v_mov_b32_e32 v11, v0
225; GFX9-NEXT:  v_mov_b32_e32 v12, v0
226; GFX9-NEXT:  v_mov_b32_e32 v13, v0
227; GFX9-NEXT:  v_mov_b32_e32 v14, v0
228; GFX9-NEXT:  v_mov_b32_e32 v15, v0
229; GFX9-NEXT:  v_mov_b32_e32 v16, v0
230; GFX9-NEXT:  v_mov_b32_e32 v17, v0
231; GFX9-NEXT:  v_mov_b32_e32 v18, v0
232; GFX9-NEXT:  v_mov_b32_e32 v19, v0
233; GFX9-NEXT:  v_mov_b32_e32 v20, v0
234; GFX9-NEXT:  v_mov_b32_e32 v21, v0
235; GFX9-NEXT:  v_mov_b32_e32 v22, v0
236; GFX9-NEXT:  v_mov_b32_e32 v23, v0
237; GFX9-NEXT:  v_mov_b32_e32 v24, v0
238; GFX9-NEXT:  v_mov_b32_e32 v25, v0
239; GFX9-NEXT:  v_mov_b32_e32 v26, v0
240; GFX9-NEXT:  v_mov_b32_e32 v27, v0
241; GFX9-NEXT:  v_mov_b32_e32 v28, v0
242; GFX9-NEXT:  v_mov_b32_e32 v29, v0
243; GFX9-NEXT:  v_mov_b32_e32 v30, v0
244; GFX9-NEXT:  v_mov_b32_e32 v31, v0
245; GFX9-NEXT:  s_setpc_b64 s[30:31]
246;
247; GFX10-LABEL: shuffle_v32i8_rebroadcast:
248; GFX10:       ; %bb.0: ; %entry
249; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250; GFX10-NEXT:  global_load_dword v0, v[0:1], off
251; GFX10-NEXT:  s_waitcnt vmcnt(0)
252; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
253; GFX10-NEXT:  v_mov_b32_e32 v1, v0
254; GFX10-NEXT:  v_mov_b32_e32 v2, v0
255; GFX10-NEXT:  v_mov_b32_e32 v3, v0
256; GFX10-NEXT:  v_mov_b32_e32 v4, v0
257; GFX10-NEXT:  v_mov_b32_e32 v5, v0
258; GFX10-NEXT:  v_mov_b32_e32 v6, v0
259; GFX10-NEXT:  v_mov_b32_e32 v7, v0
260; GFX10-NEXT:  v_mov_b32_e32 v8, v0
261; GFX10-NEXT:  v_mov_b32_e32 v9, v0
262; GFX10-NEXT:  v_mov_b32_e32 v10, v0
263; GFX10-NEXT:  v_mov_b32_e32 v11, v0
264; GFX10-NEXT:  v_mov_b32_e32 v12, v0
265; GFX10-NEXT:  v_mov_b32_e32 v13, v0
266; GFX10-NEXT:  v_mov_b32_e32 v14, v0
267; GFX10-NEXT:  v_mov_b32_e32 v15, v0
268; GFX10-NEXT:  v_mov_b32_e32 v16, v0
269; GFX10-NEXT:  v_mov_b32_e32 v17, v0
270; GFX10-NEXT:  v_mov_b32_e32 v18, v0
271; GFX10-NEXT:  v_mov_b32_e32 v19, v0
272; GFX10-NEXT:  v_mov_b32_e32 v20, v0
273; GFX10-NEXT:  v_mov_b32_e32 v21, v0
274; GFX10-NEXT:  v_mov_b32_e32 v22, v0
275; GFX10-NEXT:  v_mov_b32_e32 v23, v0
276; GFX10-NEXT:  v_mov_b32_e32 v24, v0
277; GFX10-NEXT:  v_mov_b32_e32 v25, v0
278; GFX10-NEXT:  v_mov_b32_e32 v26, v0
279; GFX10-NEXT:  v_mov_b32_e32 v27, v0
280; GFX10-NEXT:  v_mov_b32_e32 v28, v0
281; GFX10-NEXT:  v_mov_b32_e32 v29, v0
282; GFX10-NEXT:  v_mov_b32_e32 v30, v0
283; GFX10-NEXT:  v_mov_b32_e32 v31, v0
284; GFX10-NEXT:  s_setpc_b64 s[30:31]
285;
286; GFX11-LABEL: shuffle_v32i8_rebroadcast:
287; GFX11:       ; %bb.0: ; %entry
288; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
290; GFX11-NEXT:  s_waitcnt vmcnt(0)
291; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
292; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
293; GFX11-NEXT:  v_mov_b32_e32 v1, v0
294; GFX11-NEXT:  v_mov_b32_e32 v2, v0
295; GFX11-NEXT:  v_mov_b32_e32 v3, v0
296; GFX11-NEXT:  v_mov_b32_e32 v4, v0
297; GFX11-NEXT:  v_mov_b32_e32 v5, v0
298; GFX11-NEXT:  v_mov_b32_e32 v6, v0
299; GFX11-NEXT:  v_mov_b32_e32 v7, v0
300; GFX11-NEXT:  v_mov_b32_e32 v8, v0
301; GFX11-NEXT:  v_mov_b32_e32 v9, v0
302; GFX11-NEXT:  v_mov_b32_e32 v10, v0
303; GFX11-NEXT:  v_mov_b32_e32 v11, v0
304; GFX11-NEXT:  v_mov_b32_e32 v12, v0
305; GFX11-NEXT:  v_mov_b32_e32 v13, v0
306; GFX11-NEXT:  v_mov_b32_e32 v14, v0
307; GFX11-NEXT:  v_mov_b32_e32 v15, v0
308; GFX11-NEXT:  v_mov_b32_e32 v16, v0
309; GFX11-NEXT:  v_mov_b32_e32 v17, v0
310; GFX11-NEXT:  v_mov_b32_e32 v18, v0
311; GFX11-NEXT:  v_mov_b32_e32 v19, v0
312; GFX11-NEXT:  v_mov_b32_e32 v20, v0
313; GFX11-NEXT:  v_mov_b32_e32 v21, v0
314; GFX11-NEXT:  v_mov_b32_e32 v22, v0
315; GFX11-NEXT:  v_mov_b32_e32 v23, v0
316; GFX11-NEXT:  v_mov_b32_e32 v24, v0
317; GFX11-NEXT:  v_mov_b32_e32 v25, v0
318; GFX11-NEXT:  v_mov_b32_e32 v26, v0
319; GFX11-NEXT:  v_mov_b32_e32 v27, v0
320; GFX11-NEXT:  v_mov_b32_e32 v28, v0
321; GFX11-NEXT:  v_mov_b32_e32 v29, v0
322; GFX11-NEXT:  v_mov_b32_e32 v30, v0
323; GFX11-NEXT:  v_mov_b32_e32 v31, v0
324; GFX11-NEXT:  s_setpc_b64 s[30:31]
325entry:
326  %val0 = load <32 x i8>, ptr addrspace(1) %arg0
327  %val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
328  ret <32 x i8> %val1
329}
330
; Loads a <2 x i16> from the global pointer %arg0 and returns a vector whose
; two lanes are both element 1 of the loaded value (shuffle mask <1, 1>).
; The assertions for all three targets expect one dword load followed by a
; single v_perm_b32 of v0 with itself using selector 0x7060302. The gfx900
; assertions expect the selector to be placed in s4 by an s_mov_b32 first,
; while the gfx1010 and gfx1100 assertions expect it as an immediate operand.
; NOTE(review): the selector presumably picks the upper 16 bits for both
; halves of the result - confirm against the ISA's v_perm_b32 byte-select
; encoding.
331define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
332; GFX9-LABEL: shuffle_v2i16_rebroadcast:
333; GFX9:       ; %bb.0: ; %entry
334; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335; GFX9-NEXT:  global_load_dword v0, v[0:1], off
336; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
337; GFX9-NEXT:  s_waitcnt vmcnt(0)
338; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
339; GFX9-NEXT:  s_setpc_b64 s[30:31]
340;
341; GFX10-LABEL: shuffle_v2i16_rebroadcast:
342; GFX10:       ; %bb.0: ; %entry
343; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX10-NEXT:  global_load_dword v0, v[0:1], off
345; GFX10-NEXT:  s_waitcnt vmcnt(0)
346; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
347; GFX10-NEXT:  s_setpc_b64 s[30:31]
348;
349; GFX11-LABEL: shuffle_v2i16_rebroadcast:
350; GFX11:       ; %bb.0: ; %entry
351; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
353; GFX11-NEXT:  s_waitcnt vmcnt(0)
354; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
355; GFX11-NEXT:  s_setpc_b64 s[30:31]
356entry:
357  %val0 = load <2 x i16>, ptr addrspace(1) %arg0
358  %val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1>
359  ret <2 x i16> %val1
360}
361
; Loads a <4 x i16> from the global pointer %arg0 and returns a vector whose
; four lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads eight bytes, the assertions for all three targets
; expect only a single dword load, a v_perm_b32 of v0 with itself using
; selector 0x7060302, and a copy of v0 into v1. The gfx900 assertions expect
; the selector in s4 (set by s_mov_b32); the gfx1010 and gfx1100 assertions
; expect it as an immediate, and gfx1100 also expects an s_delay_alu after
; the permute.
; NOTE(review): the selector presumably picks the upper 16 bits for both
; halves of the result - confirm against the ISA's v_perm_b32 byte-select
; encoding.
362define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
363; GFX9-LABEL: shuffle_v4i16_rebroadcast:
364; GFX9:       ; %bb.0: ; %entry
365; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366; GFX9-NEXT:  global_load_dword v0, v[0:1], off
367; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
368; GFX9-NEXT:  s_waitcnt vmcnt(0)
369; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
370; GFX9-NEXT:  v_mov_b32_e32 v1, v0
371; GFX9-NEXT:  s_setpc_b64 s[30:31]
372;
373; GFX10-LABEL: shuffle_v4i16_rebroadcast:
374; GFX10:       ; %bb.0: ; %entry
375; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376; GFX10-NEXT:  global_load_dword v0, v[0:1], off
377; GFX10-NEXT:  s_waitcnt vmcnt(0)
378; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
379; GFX10-NEXT:  v_mov_b32_e32 v1, v0
380; GFX10-NEXT:  s_setpc_b64 s[30:31]
381;
382; GFX11-LABEL: shuffle_v4i16_rebroadcast:
383; GFX11:       ; %bb.0: ; %entry
384; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
386; GFX11-NEXT:  s_waitcnt vmcnt(0)
387; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
388; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
389; GFX11-NEXT:  v_mov_b32_e32 v1, v0
390; GFX11-NEXT:  s_setpc_b64 s[30:31]
391entry:
392  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
393  %val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
394  ret <4 x i16> %val1
395}
396
; Loads an <8 x i16> from the global pointer %arg0 and returns a vector whose
; eight lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads sixteen bytes, the assertions for all three targets
; expect only a single dword load, a v_perm_b32 of v0 with itself using
; selector 0x7060302, and copies of v0 into v1..v3. The gfx900 assertions
; expect the selector in s4 (set by s_mov_b32); the gfx1010 and gfx1100
; assertions expect it as an immediate, and gfx1100 also expects an
; s_delay_alu after the permute.
; NOTE(review): the selector presumably picks the upper 16 bits for both
; halves of the result - confirm against the ISA's v_perm_b32 byte-select
; encoding.
397define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
398; GFX9-LABEL: shuffle_v8i16_rebroadcast:
399; GFX9:       ; %bb.0: ; %entry
400; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GFX9-NEXT:  global_load_dword v0, v[0:1], off
402; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
403; GFX9-NEXT:  s_waitcnt vmcnt(0)
404; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
405; GFX9-NEXT:  v_mov_b32_e32 v1, v0
406; GFX9-NEXT:  v_mov_b32_e32 v2, v0
407; GFX9-NEXT:  v_mov_b32_e32 v3, v0
408; GFX9-NEXT:  s_setpc_b64 s[30:31]
409;
410; GFX10-LABEL: shuffle_v8i16_rebroadcast:
411; GFX10:       ; %bb.0: ; %entry
412; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
413; GFX10-NEXT:  global_load_dword v0, v[0:1], off
414; GFX10-NEXT:  s_waitcnt vmcnt(0)
415; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
416; GFX10-NEXT:  v_mov_b32_e32 v1, v0
417; GFX10-NEXT:  v_mov_b32_e32 v2, v0
418; GFX10-NEXT:  v_mov_b32_e32 v3, v0
419; GFX10-NEXT:  s_setpc_b64 s[30:31]
420;
421; GFX11-LABEL: shuffle_v8i16_rebroadcast:
422; GFX11:       ; %bb.0: ; %entry
423; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
425; GFX11-NEXT:  s_waitcnt vmcnt(0)
426; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
427; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
428; GFX11-NEXT:  v_mov_b32_e32 v1, v0
429; GFX11-NEXT:  v_mov_b32_e32 v2, v0
430; GFX11-NEXT:  v_mov_b32_e32 v3, v0
431; GFX11-NEXT:  s_setpc_b64 s[30:31]
432entry:
433  %val0 = load <8 x i16>, ptr addrspace(1) %arg0
434  %val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
435  ret <8 x i16> %val1
436}
437
; Loads a <16 x i16> from the global pointer %arg0 and returns a vector whose
; sixteen lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads thirty-two bytes, the assertions for all three
; targets expect only a single dword load, a v_perm_b32 of v0 with itself
; using selector 0x7060302, and copies of v0 into v1..v7. The gfx900
; assertions expect the selector in s4 (set by s_mov_b32); the gfx1010 and
; gfx1100 assertions expect it as an immediate, and gfx1100 also expects an
; s_delay_alu after the permute.
; NOTE(review): the selector presumably picks the upper 16 bits for both
; halves of the result - confirm against the ISA's v_perm_b32 byte-select
; encoding.
438define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
439; GFX9-LABEL: shuffle_v16i16_rebroadcast:
440; GFX9:       ; %bb.0: ; %entry
441; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442; GFX9-NEXT:  global_load_dword v0, v[0:1], off
443; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
444; GFX9-NEXT:  s_waitcnt vmcnt(0)
445; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
446; GFX9-NEXT:  v_mov_b32_e32 v1, v0
447; GFX9-NEXT:  v_mov_b32_e32 v2, v0
448; GFX9-NEXT:  v_mov_b32_e32 v3, v0
449; GFX9-NEXT:  v_mov_b32_e32 v4, v0
450; GFX9-NEXT:  v_mov_b32_e32 v5, v0
451; GFX9-NEXT:  v_mov_b32_e32 v6, v0
452; GFX9-NEXT:  v_mov_b32_e32 v7, v0
453; GFX9-NEXT:  s_setpc_b64 s[30:31]
454;
455; GFX10-LABEL: shuffle_v16i16_rebroadcast:
456; GFX10:       ; %bb.0: ; %entry
457; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458; GFX10-NEXT:  global_load_dword v0, v[0:1], off
459; GFX10-NEXT:  s_waitcnt vmcnt(0)
460; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
461; GFX10-NEXT:  v_mov_b32_e32 v1, v0
462; GFX10-NEXT:  v_mov_b32_e32 v2, v0
463; GFX10-NEXT:  v_mov_b32_e32 v3, v0
464; GFX10-NEXT:  v_mov_b32_e32 v4, v0
465; GFX10-NEXT:  v_mov_b32_e32 v5, v0
466; GFX10-NEXT:  v_mov_b32_e32 v6, v0
467; GFX10-NEXT:  v_mov_b32_e32 v7, v0
468; GFX10-NEXT:  s_setpc_b64 s[30:31]
469;
470; GFX11-LABEL: shuffle_v16i16_rebroadcast:
471; GFX11:       ; %bb.0: ; %entry
472; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
474; GFX11-NEXT:  s_waitcnt vmcnt(0)
475; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
476; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
477; GFX11-NEXT:  v_mov_b32_e32 v1, v0
478; GFX11-NEXT:  v_mov_b32_e32 v2, v0
479; GFX11-NEXT:  v_mov_b32_e32 v3, v0
480; GFX11-NEXT:  v_mov_b32_e32 v4, v0
481; GFX11-NEXT:  v_mov_b32_e32 v5, v0
482; GFX11-NEXT:  v_mov_b32_e32 v6, v0
483; GFX11-NEXT:  v_mov_b32_e32 v7, v0
484; GFX11-NEXT:  s_setpc_b64 s[30:31]
485entry:
486  %val0 = load <16 x i16>, ptr addrspace(1) %arg0
487  %val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
488  ret <16 x i16> %val1
489}
490
; Loads a <32 x i16> from the global pointer %arg0 and returns a vector whose
; thirty-two lanes are all element 1 of the loaded value (all-ones shuffle
; mask). Although the IR loads sixty-four bytes, the assertions for all three
; targets expect only a single dword load, a v_perm_b32 of v0 with itself
; using selector 0x7060302, and copies of v0 into v1..v15. The gfx900
; assertions expect the selector in s4 (set by s_mov_b32); the gfx1010 and
; gfx1100 assertions expect it as an immediate, and gfx1100 also expects an
; s_delay_alu after the permute.
; NOTE(review): the selector presumably picks the upper 16 bits for both
; halves of the result - confirm against the ISA's v_perm_b32 byte-select
; encoding.
491define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
492; GFX9-LABEL: shuffle_v32i16_rebroadcast:
493; GFX9:       ; %bb.0: ; %entry
494; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495; GFX9-NEXT:  global_load_dword v0, v[0:1], off
496; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
497; GFX9-NEXT:  s_waitcnt vmcnt(0)
498; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
499; GFX9-NEXT:  v_mov_b32_e32 v1, v0
500; GFX9-NEXT:  v_mov_b32_e32 v2, v0
501; GFX9-NEXT:  v_mov_b32_e32 v3, v0
502; GFX9-NEXT:  v_mov_b32_e32 v4, v0
503; GFX9-NEXT:  v_mov_b32_e32 v5, v0
504; GFX9-NEXT:  v_mov_b32_e32 v6, v0
505; GFX9-NEXT:  v_mov_b32_e32 v7, v0
506; GFX9-NEXT:  v_mov_b32_e32 v8, v0
507; GFX9-NEXT:  v_mov_b32_e32 v9, v0
508; GFX9-NEXT:  v_mov_b32_e32 v10, v0
509; GFX9-NEXT:  v_mov_b32_e32 v11, v0
510; GFX9-NEXT:  v_mov_b32_e32 v12, v0
511; GFX9-NEXT:  v_mov_b32_e32 v13, v0
512; GFX9-NEXT:  v_mov_b32_e32 v14, v0
513; GFX9-NEXT:  v_mov_b32_e32 v15, v0
514; GFX9-NEXT:  s_setpc_b64 s[30:31]
515;
516; GFX10-LABEL: shuffle_v32i16_rebroadcast:
517; GFX10:       ; %bb.0: ; %entry
518; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519; GFX10-NEXT:  global_load_dword v0, v[0:1], off
520; GFX10-NEXT:  s_waitcnt vmcnt(0)
521; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
522; GFX10-NEXT:  v_mov_b32_e32 v1, v0
523; GFX10-NEXT:  v_mov_b32_e32 v2, v0
524; GFX10-NEXT:  v_mov_b32_e32 v3, v0
525; GFX10-NEXT:  v_mov_b32_e32 v4, v0
526; GFX10-NEXT:  v_mov_b32_e32 v5, v0
527; GFX10-NEXT:  v_mov_b32_e32 v6, v0
528; GFX10-NEXT:  v_mov_b32_e32 v7, v0
529; GFX10-NEXT:  v_mov_b32_e32 v8, v0
530; GFX10-NEXT:  v_mov_b32_e32 v9, v0
531; GFX10-NEXT:  v_mov_b32_e32 v10, v0
532; GFX10-NEXT:  v_mov_b32_e32 v11, v0
533; GFX10-NEXT:  v_mov_b32_e32 v12, v0
534; GFX10-NEXT:  v_mov_b32_e32 v13, v0
535; GFX10-NEXT:  v_mov_b32_e32 v14, v0
536; GFX10-NEXT:  v_mov_b32_e32 v15, v0
537; GFX10-NEXT:  s_setpc_b64 s[30:31]
538;
539; GFX11-LABEL: shuffle_v32i16_rebroadcast:
540; GFX11:       ; %bb.0: ; %entry
541; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
542; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
543; GFX11-NEXT:  s_waitcnt vmcnt(0)
544; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
545; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
546; GFX11-NEXT:  v_mov_b32_e32 v1, v0
547; GFX11-NEXT:  v_mov_b32_e32 v2, v0
548; GFX11-NEXT:  v_mov_b32_e32 v3, v0
549; GFX11-NEXT:  v_mov_b32_e32 v4, v0
550; GFX11-NEXT:  v_mov_b32_e32 v5, v0
551; GFX11-NEXT:  v_mov_b32_e32 v6, v0
552; GFX11-NEXT:  v_mov_b32_e32 v7, v0
553; GFX11-NEXT:  v_mov_b32_e32 v8, v0
554; GFX11-NEXT:  v_mov_b32_e32 v9, v0
555; GFX11-NEXT:  v_mov_b32_e32 v10, v0
556; GFX11-NEXT:  v_mov_b32_e32 v11, v0
557; GFX11-NEXT:  v_mov_b32_e32 v12, v0
558; GFX11-NEXT:  v_mov_b32_e32 v13, v0
559; GFX11-NEXT:  v_mov_b32_e32 v14, v0
560; GFX11-NEXT:  v_mov_b32_e32 v15, v0
561; GFX11-NEXT:  s_setpc_b64 s[30:31]
562entry:
563  %val0 = load <32 x i16>, ptr addrspace(1) %arg0
564  %val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
565  ret <32 x i16> %val1
566}
567
; Loads a <2 x i32> from the global pointer %arg0 and returns a vector whose
; two lanes are both element 1 of the loaded value (shuffle mask <1, 1>).
; The assertions for all three targets expect a single dword load at byte
; offset 4 (no shift or permute) followed by a copy of v0 into v1.
568define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) {
569; GFX9-LABEL: shuffle_v2i32_rebroadcast:
570; GFX9:       ; %bb.0: ; %entry
571; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
572; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
573; GFX9-NEXT:  s_waitcnt vmcnt(0)
574; GFX9-NEXT:  v_mov_b32_e32 v1, v0
575; GFX9-NEXT:  s_setpc_b64 s[30:31]
576;
577; GFX10-LABEL: shuffle_v2i32_rebroadcast:
578; GFX10:       ; %bb.0: ; %entry
579; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
581; GFX10-NEXT:  s_waitcnt vmcnt(0)
582; GFX10-NEXT:  v_mov_b32_e32 v1, v0
583; GFX10-NEXT:  s_setpc_b64 s[30:31]
584;
585; GFX11-LABEL: shuffle_v2i32_rebroadcast:
586; GFX11:       ; %bb.0: ; %entry
587; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
589; GFX11-NEXT:  s_waitcnt vmcnt(0)
590; GFX11-NEXT:  v_mov_b32_e32 v1, v0
591; GFX11-NEXT:  s_setpc_b64 s[30:31]
592entry:
593  %val0 = load <2 x i32>, ptr addrspace(1) %arg0
594  %val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
595  ret <2 x i32> %val1
596}
597
; Loads a <4 x i32> from the global pointer %arg0 and returns a vector whose
; four lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads sixteen bytes, the assertions for all three targets
; expect only a single dword load at byte offset 4 (no shift or permute)
; followed by copies of v0 into v1..v3.
598define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) {
599; GFX9-LABEL: shuffle_v4i32_rebroadcast:
600; GFX9:       ; %bb.0: ; %entry
601; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
602; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
603; GFX9-NEXT:  s_waitcnt vmcnt(0)
604; GFX9-NEXT:  v_mov_b32_e32 v1, v0
605; GFX9-NEXT:  v_mov_b32_e32 v2, v0
606; GFX9-NEXT:  v_mov_b32_e32 v3, v0
607; GFX9-NEXT:  s_setpc_b64 s[30:31]
608;
609; GFX10-LABEL: shuffle_v4i32_rebroadcast:
610; GFX10:       ; %bb.0: ; %entry
611; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
613; GFX10-NEXT:  s_waitcnt vmcnt(0)
614; GFX10-NEXT:  v_mov_b32_e32 v1, v0
615; GFX10-NEXT:  v_mov_b32_e32 v2, v0
616; GFX10-NEXT:  v_mov_b32_e32 v3, v0
617; GFX10-NEXT:  s_setpc_b64 s[30:31]
618;
619; GFX11-LABEL: shuffle_v4i32_rebroadcast:
620; GFX11:       ; %bb.0: ; %entry
621; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
622; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
623; GFX11-NEXT:  s_waitcnt vmcnt(0)
624; GFX11-NEXT:  v_mov_b32_e32 v1, v0
625; GFX11-NEXT:  v_mov_b32_e32 v2, v0
626; GFX11-NEXT:  v_mov_b32_e32 v3, v0
627; GFX11-NEXT:  s_setpc_b64 s[30:31]
628entry:
629  %val0 = load <4 x i32>, ptr addrspace(1) %arg0
630  %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
631  ret <4 x i32> %val1
632}
633
; Loads an <8 x i32> from the global pointer %arg0 and returns a vector whose
; eight lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads thirty-two bytes, the assertions for all three
; targets expect only a single dword load at byte offset 4 (no shift or
; permute) followed by copies of v0 into v1..v7.
634define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) {
635; GFX9-LABEL: shuffle_v8i32_rebroadcast:
636; GFX9:       ; %bb.0: ; %entry
637; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
639; GFX9-NEXT:  s_waitcnt vmcnt(0)
640; GFX9-NEXT:  v_mov_b32_e32 v1, v0
641; GFX9-NEXT:  v_mov_b32_e32 v2, v0
642; GFX9-NEXT:  v_mov_b32_e32 v3, v0
643; GFX9-NEXT:  v_mov_b32_e32 v4, v0
644; GFX9-NEXT:  v_mov_b32_e32 v5, v0
645; GFX9-NEXT:  v_mov_b32_e32 v6, v0
646; GFX9-NEXT:  v_mov_b32_e32 v7, v0
647; GFX9-NEXT:  s_setpc_b64 s[30:31]
648;
649; GFX10-LABEL: shuffle_v8i32_rebroadcast:
650; GFX10:       ; %bb.0: ; %entry
651; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
653; GFX10-NEXT:  s_waitcnt vmcnt(0)
654; GFX10-NEXT:  v_mov_b32_e32 v1, v0
655; GFX10-NEXT:  v_mov_b32_e32 v2, v0
656; GFX10-NEXT:  v_mov_b32_e32 v3, v0
657; GFX10-NEXT:  v_mov_b32_e32 v4, v0
658; GFX10-NEXT:  v_mov_b32_e32 v5, v0
659; GFX10-NEXT:  v_mov_b32_e32 v6, v0
660; GFX10-NEXT:  v_mov_b32_e32 v7, v0
661; GFX10-NEXT:  s_setpc_b64 s[30:31]
662;
663; GFX11-LABEL: shuffle_v8i32_rebroadcast:
664; GFX11:       ; %bb.0: ; %entry
665; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
667; GFX11-NEXT:  s_waitcnt vmcnt(0)
668; GFX11-NEXT:  v_mov_b32_e32 v1, v0
669; GFX11-NEXT:  v_mov_b32_e32 v2, v0
670; GFX11-NEXT:  v_mov_b32_e32 v3, v0
671; GFX11-NEXT:  v_mov_b32_e32 v4, v0
672; GFX11-NEXT:  v_mov_b32_e32 v5, v0
673; GFX11-NEXT:  v_mov_b32_e32 v6, v0
674; GFX11-NEXT:  v_mov_b32_e32 v7, v0
675; GFX11-NEXT:  s_setpc_b64 s[30:31]
676entry:
677  %val0 = load <8 x i32>, ptr addrspace(1) %arg0
678  %val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
679  ret <8 x i32> %val1
680}
681
; Loads a <16 x i32> from the global pointer %arg0 and returns a vector whose
; sixteen lanes are all element 1 of the loaded value (all-ones shuffle mask).
; Although the IR loads sixty-four bytes, the assertions for all three
; targets expect only a single dword load at byte offset 4 (no shift or
; permute) followed by copies of v0 into v1..v15.
682define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) {
683; GFX9-LABEL: shuffle_v16i32_rebroadcast:
684; GFX9:       ; %bb.0: ; %entry
685; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
687; GFX9-NEXT:  s_waitcnt vmcnt(0)
688; GFX9-NEXT:  v_mov_b32_e32 v1, v0
689; GFX9-NEXT:  v_mov_b32_e32 v2, v0
690; GFX9-NEXT:  v_mov_b32_e32 v3, v0
691; GFX9-NEXT:  v_mov_b32_e32 v4, v0
692; GFX9-NEXT:  v_mov_b32_e32 v5, v0
693; GFX9-NEXT:  v_mov_b32_e32 v6, v0
694; GFX9-NEXT:  v_mov_b32_e32 v7, v0
695; GFX9-NEXT:  v_mov_b32_e32 v8, v0
696; GFX9-NEXT:  v_mov_b32_e32 v9, v0
697; GFX9-NEXT:  v_mov_b32_e32 v10, v0
698; GFX9-NEXT:  v_mov_b32_e32 v11, v0
699; GFX9-NEXT:  v_mov_b32_e32 v12, v0
700; GFX9-NEXT:  v_mov_b32_e32 v13, v0
701; GFX9-NEXT:  v_mov_b32_e32 v14, v0
702; GFX9-NEXT:  v_mov_b32_e32 v15, v0
703; GFX9-NEXT:  s_setpc_b64 s[30:31]
704;
705; GFX10-LABEL: shuffle_v16i32_rebroadcast:
706; GFX10:       ; %bb.0: ; %entry
707; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
709; GFX10-NEXT:  s_waitcnt vmcnt(0)
710; GFX10-NEXT:  v_mov_b32_e32 v1, v0
711; GFX10-NEXT:  v_mov_b32_e32 v2, v0
712; GFX10-NEXT:  v_mov_b32_e32 v3, v0
713; GFX10-NEXT:  v_mov_b32_e32 v4, v0
714; GFX10-NEXT:  v_mov_b32_e32 v5, v0
715; GFX10-NEXT:  v_mov_b32_e32 v6, v0
716; GFX10-NEXT:  v_mov_b32_e32 v7, v0
717; GFX10-NEXT:  v_mov_b32_e32 v8, v0
718; GFX10-NEXT:  v_mov_b32_e32 v9, v0
719; GFX10-NEXT:  v_mov_b32_e32 v10, v0
720; GFX10-NEXT:  v_mov_b32_e32 v11, v0
721; GFX10-NEXT:  v_mov_b32_e32 v12, v0
722; GFX10-NEXT:  v_mov_b32_e32 v13, v0
723; GFX10-NEXT:  v_mov_b32_e32 v14, v0
724; GFX10-NEXT:  v_mov_b32_e32 v15, v0
725; GFX10-NEXT:  s_setpc_b64 s[30:31]
726;
727; GFX11-LABEL: shuffle_v16i32_rebroadcast:
728; GFX11:       ; %bb.0: ; %entry
729; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
731; GFX11-NEXT:  s_waitcnt vmcnt(0)
732; GFX11-NEXT:  v_mov_b32_e32 v1, v0
733; GFX11-NEXT:  v_mov_b32_e32 v2, v0
734; GFX11-NEXT:  v_mov_b32_e32 v3, v0
735; GFX11-NEXT:  v_mov_b32_e32 v4, v0
736; GFX11-NEXT:  v_mov_b32_e32 v5, v0
737; GFX11-NEXT:  v_mov_b32_e32 v6, v0
738; GFX11-NEXT:  v_mov_b32_e32 v7, v0
739; GFX11-NEXT:  v_mov_b32_e32 v8, v0
740; GFX11-NEXT:  v_mov_b32_e32 v9, v0
741; GFX11-NEXT:  v_mov_b32_e32 v10, v0
742; GFX11-NEXT:  v_mov_b32_e32 v11, v0
743; GFX11-NEXT:  v_mov_b32_e32 v12, v0
744; GFX11-NEXT:  v_mov_b32_e32 v13, v0
745; GFX11-NEXT:  v_mov_b32_e32 v14, v0
746; GFX11-NEXT:  v_mov_b32_e32 v15, v0
747; GFX11-NEXT:  s_setpc_b64 s[30:31]
748entry:
749  %val0 = load <16 x i32>, ptr addrspace(1) %arg0
750  %val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
751  ret <16 x i32> %val1
752}
753
; Splat of element 1 for <32 x i32>: the vector is loaded through the
; addrspace(1) pointer %arg0 and shuffled with a mask whose 32 entries all
; select source lane 1. For each of the three RUN targets the autogenerated
; checks below expect a single 32-bit load at byte offset 4 (only lane 1 is
; fetched) followed by copies of v0 into v1 through v31.
754define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) {
755; GFX9-LABEL: shuffle_v32i32_rebroadcast:
756; GFX9:       ; %bb.0: ; %entry
757; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
758; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
759; GFX9-NEXT:  s_waitcnt vmcnt(0)
760; GFX9-NEXT:  v_mov_b32_e32 v1, v0
761; GFX9-NEXT:  v_mov_b32_e32 v2, v0
762; GFX9-NEXT:  v_mov_b32_e32 v3, v0
763; GFX9-NEXT:  v_mov_b32_e32 v4, v0
764; GFX9-NEXT:  v_mov_b32_e32 v5, v0
765; GFX9-NEXT:  v_mov_b32_e32 v6, v0
766; GFX9-NEXT:  v_mov_b32_e32 v7, v0
767; GFX9-NEXT:  v_mov_b32_e32 v8, v0
768; GFX9-NEXT:  v_mov_b32_e32 v9, v0
769; GFX9-NEXT:  v_mov_b32_e32 v10, v0
770; GFX9-NEXT:  v_mov_b32_e32 v11, v0
771; GFX9-NEXT:  v_mov_b32_e32 v12, v0
772; GFX9-NEXT:  v_mov_b32_e32 v13, v0
773; GFX9-NEXT:  v_mov_b32_e32 v14, v0
774; GFX9-NEXT:  v_mov_b32_e32 v15, v0
775; GFX9-NEXT:  v_mov_b32_e32 v16, v0
776; GFX9-NEXT:  v_mov_b32_e32 v17, v0
777; GFX9-NEXT:  v_mov_b32_e32 v18, v0
778; GFX9-NEXT:  v_mov_b32_e32 v19, v0
779; GFX9-NEXT:  v_mov_b32_e32 v20, v0
780; GFX9-NEXT:  v_mov_b32_e32 v21, v0
781; GFX9-NEXT:  v_mov_b32_e32 v22, v0
782; GFX9-NEXT:  v_mov_b32_e32 v23, v0
783; GFX9-NEXT:  v_mov_b32_e32 v24, v0
784; GFX9-NEXT:  v_mov_b32_e32 v25, v0
785; GFX9-NEXT:  v_mov_b32_e32 v26, v0
786; GFX9-NEXT:  v_mov_b32_e32 v27, v0
787; GFX9-NEXT:  v_mov_b32_e32 v28, v0
788; GFX9-NEXT:  v_mov_b32_e32 v29, v0
789; GFX9-NEXT:  v_mov_b32_e32 v30, v0
790; GFX9-NEXT:  v_mov_b32_e32 v31, v0
791; GFX9-NEXT:  s_setpc_b64 s[30:31]
792;
793; GFX10-LABEL: shuffle_v32i32_rebroadcast:
794; GFX10:       ; %bb.0: ; %entry
795; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
797; GFX10-NEXT:  s_waitcnt vmcnt(0)
798; GFX10-NEXT:  v_mov_b32_e32 v1, v0
799; GFX10-NEXT:  v_mov_b32_e32 v2, v0
800; GFX10-NEXT:  v_mov_b32_e32 v3, v0
801; GFX10-NEXT:  v_mov_b32_e32 v4, v0
802; GFX10-NEXT:  v_mov_b32_e32 v5, v0
803; GFX10-NEXT:  v_mov_b32_e32 v6, v0
804; GFX10-NEXT:  v_mov_b32_e32 v7, v0
805; GFX10-NEXT:  v_mov_b32_e32 v8, v0
806; GFX10-NEXT:  v_mov_b32_e32 v9, v0
807; GFX10-NEXT:  v_mov_b32_e32 v10, v0
808; GFX10-NEXT:  v_mov_b32_e32 v11, v0
809; GFX10-NEXT:  v_mov_b32_e32 v12, v0
810; GFX10-NEXT:  v_mov_b32_e32 v13, v0
811; GFX10-NEXT:  v_mov_b32_e32 v14, v0
812; GFX10-NEXT:  v_mov_b32_e32 v15, v0
813; GFX10-NEXT:  v_mov_b32_e32 v16, v0
814; GFX10-NEXT:  v_mov_b32_e32 v17, v0
815; GFX10-NEXT:  v_mov_b32_e32 v18, v0
816; GFX10-NEXT:  v_mov_b32_e32 v19, v0
817; GFX10-NEXT:  v_mov_b32_e32 v20, v0
818; GFX10-NEXT:  v_mov_b32_e32 v21, v0
819; GFX10-NEXT:  v_mov_b32_e32 v22, v0
820; GFX10-NEXT:  v_mov_b32_e32 v23, v0
821; GFX10-NEXT:  v_mov_b32_e32 v24, v0
822; GFX10-NEXT:  v_mov_b32_e32 v25, v0
823; GFX10-NEXT:  v_mov_b32_e32 v26, v0
824; GFX10-NEXT:  v_mov_b32_e32 v27, v0
825; GFX10-NEXT:  v_mov_b32_e32 v28, v0
826; GFX10-NEXT:  v_mov_b32_e32 v29, v0
827; GFX10-NEXT:  v_mov_b32_e32 v30, v0
828; GFX10-NEXT:  v_mov_b32_e32 v31, v0
829; GFX10-NEXT:  s_setpc_b64 s[30:31]
830;
831; GFX11-LABEL: shuffle_v32i32_rebroadcast:
832; GFX11:       ; %bb.0: ; %entry
833; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
835; GFX11-NEXT:  s_waitcnt vmcnt(0)
836; GFX11-NEXT:  v_mov_b32_e32 v1, v0
837; GFX11-NEXT:  v_mov_b32_e32 v2, v0
838; GFX11-NEXT:  v_mov_b32_e32 v3, v0
839; GFX11-NEXT:  v_mov_b32_e32 v4, v0
840; GFX11-NEXT:  v_mov_b32_e32 v5, v0
841; GFX11-NEXT:  v_mov_b32_e32 v6, v0
842; GFX11-NEXT:  v_mov_b32_e32 v7, v0
843; GFX11-NEXT:  v_mov_b32_e32 v8, v0
844; GFX11-NEXT:  v_mov_b32_e32 v9, v0
845; GFX11-NEXT:  v_mov_b32_e32 v10, v0
846; GFX11-NEXT:  v_mov_b32_e32 v11, v0
847; GFX11-NEXT:  v_mov_b32_e32 v12, v0
848; GFX11-NEXT:  v_mov_b32_e32 v13, v0
849; GFX11-NEXT:  v_mov_b32_e32 v14, v0
850; GFX11-NEXT:  v_mov_b32_e32 v15, v0
851; GFX11-NEXT:  v_mov_b32_e32 v16, v0
852; GFX11-NEXT:  v_mov_b32_e32 v17, v0
853; GFX11-NEXT:  v_mov_b32_e32 v18, v0
854; GFX11-NEXT:  v_mov_b32_e32 v19, v0
855; GFX11-NEXT:  v_mov_b32_e32 v20, v0
856; GFX11-NEXT:  v_mov_b32_e32 v21, v0
857; GFX11-NEXT:  v_mov_b32_e32 v22, v0
858; GFX11-NEXT:  v_mov_b32_e32 v23, v0
859; GFX11-NEXT:  v_mov_b32_e32 v24, v0
860; GFX11-NEXT:  v_mov_b32_e32 v25, v0
861; GFX11-NEXT:  v_mov_b32_e32 v26, v0
862; GFX11-NEXT:  v_mov_b32_e32 v27, v0
863; GFX11-NEXT:  v_mov_b32_e32 v28, v0
864; GFX11-NEXT:  v_mov_b32_e32 v29, v0
865; GFX11-NEXT:  v_mov_b32_e32 v30, v0
866; GFX11-NEXT:  v_mov_b32_e32 v31, v0
867; GFX11-NEXT:  s_setpc_b64 s[30:31]
868entry:
869  %val0 = load <32 x i32>, ptr addrspace(1) %arg0
870  %val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
871  ret <32 x i32> %val1
872}
873
; Splat of element 1 for <2 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose two lanes both select
; source lane 1. The autogenerated checks expect one 32-bit load followed by a
; single v_perm_b32 of the loaded dword with itself using selector 0x7060302;
; the gfx900 sequence first materializes that selector in s4, the other two
; targets pass it as a literal operand.
874define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
875; GFX9-LABEL: shuffle_v2bf16_rebroadcast:
876; GFX9:       ; %bb.0: ; %entry
877; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
878; GFX9-NEXT:  global_load_dword v0, v[0:1], off
879; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
880; GFX9-NEXT:  s_waitcnt vmcnt(0)
881; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
882; GFX9-NEXT:  s_setpc_b64 s[30:31]
883;
884; GFX10-LABEL: shuffle_v2bf16_rebroadcast:
885; GFX10:       ; %bb.0: ; %entry
886; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
887; GFX10-NEXT:  global_load_dword v0, v[0:1], off
888; GFX10-NEXT:  s_waitcnt vmcnt(0)
889; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
890; GFX10-NEXT:  s_setpc_b64 s[30:31]
891;
892; GFX11-LABEL: shuffle_v2bf16_rebroadcast:
893; GFX11:       ; %bb.0: ; %entry
894; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
895; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
896; GFX11-NEXT:  s_waitcnt vmcnt(0)
897; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
898; GFX11-NEXT:  s_setpc_b64 s[30:31]
899entry:
900  %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
901  %val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
902  ret <2 x bfloat> %val1
903}
904
; Splat of element 1 for <3 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose three lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load into v1, a
; v_perm_b32 with selector 0x7060302 producing v0, and a v_alignbit_b32 by 16
; producing v1.
; NOTE(review): the scalar source of v_alignbit_b32 (s4 on gfx1010, s0 on
; gfx1100) is never written in those expected sequences; presumably it only
; feeds the unused upper half of the second return register - confirm before
; relying on it.
905define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
906; GFX9-LABEL: shuffle_v3bf16_rebroadcast:
907; GFX9:       ; %bb.0: ; %entry
908; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909; GFX9-NEXT:  global_load_dword v1, v[0:1], off
910; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
911; GFX9-NEXT:  s_waitcnt vmcnt(0)
912; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
913; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
914; GFX9-NEXT:  s_setpc_b64 s[30:31]
915;
916; GFX10-LABEL: shuffle_v3bf16_rebroadcast:
917; GFX10:       ; %bb.0: ; %entry
918; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919; GFX10-NEXT:  global_load_dword v1, v[0:1], off
920; GFX10-NEXT:  s_waitcnt vmcnt(0)
921; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
922; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
923; GFX10-NEXT:  s_setpc_b64 s[30:31]
924;
925; GFX11-LABEL: shuffle_v3bf16_rebroadcast:
926; GFX11:       ; %bb.0: ; %entry
927; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
929; GFX11-NEXT:  s_waitcnt vmcnt(0)
930; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
931; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
932; GFX11-NEXT:  s_setpc_b64 s[30:31]
933entry:
934  %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
935  %val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1>
936  ret <3 x bfloat> %val1
937}
938
; Splat of element 1 for <4 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose four lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and a copy of v0
; into v1. The gfx1100 sequence additionally has an s_delay_alu between the
; permute and the copy.
939define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
940; GFX9-LABEL: shuffle_v4bf16_rebroadcast:
941; GFX9:       ; %bb.0: ; %entry
942; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943; GFX9-NEXT:  global_load_dword v0, v[0:1], off
944; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
945; GFX9-NEXT:  s_waitcnt vmcnt(0)
946; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
947; GFX9-NEXT:  v_mov_b32_e32 v1, v0
948; GFX9-NEXT:  s_setpc_b64 s[30:31]
949;
950; GFX10-LABEL: shuffle_v4bf16_rebroadcast:
951; GFX10:       ; %bb.0: ; %entry
952; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
953; GFX10-NEXT:  global_load_dword v0, v[0:1], off
954; GFX10-NEXT:  s_waitcnt vmcnt(0)
955; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
956; GFX10-NEXT:  v_mov_b32_e32 v1, v0
957; GFX10-NEXT:  s_setpc_b64 s[30:31]
958;
959; GFX11-LABEL: shuffle_v4bf16_rebroadcast:
960; GFX11:       ; %bb.0: ; %entry
961; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
963; GFX11-NEXT:  s_waitcnt vmcnt(0)
964; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
965; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
966; GFX11-NEXT:  v_mov_b32_e32 v1, v0
967; GFX11-NEXT:  s_setpc_b64 s[30:31]
968entry:
969  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
970  %val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
971  ret <4 x bfloat> %val1
972}
973
; Splat of element 1 for <6 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose six lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 and v2. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
974define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
975; GFX9-LABEL: shuffle_v6bf16_rebroadcast:
976; GFX9:       ; %bb.0: ; %entry
977; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978; GFX9-NEXT:  global_load_dword v0, v[0:1], off
979; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
980; GFX9-NEXT:  s_waitcnt vmcnt(0)
981; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
982; GFX9-NEXT:  v_mov_b32_e32 v1, v0
983; GFX9-NEXT:  v_mov_b32_e32 v2, v0
984; GFX9-NEXT:  s_setpc_b64 s[30:31]
985;
986; GFX10-LABEL: shuffle_v6bf16_rebroadcast:
987; GFX10:       ; %bb.0: ; %entry
988; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989; GFX10-NEXT:  global_load_dword v0, v[0:1], off
990; GFX10-NEXT:  s_waitcnt vmcnt(0)
991; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
992; GFX10-NEXT:  v_mov_b32_e32 v1, v0
993; GFX10-NEXT:  v_mov_b32_e32 v2, v0
994; GFX10-NEXT:  s_setpc_b64 s[30:31]
995;
996; GFX11-LABEL: shuffle_v6bf16_rebroadcast:
997; GFX11:       ; %bb.0: ; %entry
998; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
999; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1000; GFX11-NEXT:  s_waitcnt vmcnt(0)
1001; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1002; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1003; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1004; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1005; GFX11-NEXT:  s_setpc_b64 s[30:31]
1006entry:
1007  %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
1008  %val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1009  ret <6 x bfloat> %val1
1010}
1011
; Splat of element 1 for <8 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose eight lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 through v3. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1012define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
1013; GFX9-LABEL: shuffle_v8bf16_rebroadcast:
1014; GFX9:       ; %bb.0: ; %entry
1015; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1017; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1018; GFX9-NEXT:  s_waitcnt vmcnt(0)
1019; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1020; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1021; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1022; GFX9-NEXT:  v_mov_b32_e32 v3, v0
1023; GFX9-NEXT:  s_setpc_b64 s[30:31]
1024;
1025; GFX10-LABEL: shuffle_v8bf16_rebroadcast:
1026; GFX10:       ; %bb.0: ; %entry
1027; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1029; GFX10-NEXT:  s_waitcnt vmcnt(0)
1030; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1031; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1032; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1033; GFX10-NEXT:  v_mov_b32_e32 v3, v0
1034; GFX10-NEXT:  s_setpc_b64 s[30:31]
1035;
1036; GFX11-LABEL: shuffle_v8bf16_rebroadcast:
1037; GFX11:       ; %bb.0: ; %entry
1038; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1039; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1040; GFX11-NEXT:  s_waitcnt vmcnt(0)
1041; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1042; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1043; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1044; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1045; GFX11-NEXT:  v_mov_b32_e32 v3, v0
1046; GFX11-NEXT:  s_setpc_b64 s[30:31]
1047entry:
1048  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
1049  %val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1050  ret <8 x bfloat> %val1
1051}
1052
; Splat of element 1 for <16 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose sixteen lanes all
; select source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 through v7. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1053define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
1054; GFX9-LABEL: shuffle_v16bf16_rebroadcast:
1055; GFX9:       ; %bb.0: ; %entry
1056; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1058; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1059; GFX9-NEXT:  s_waitcnt vmcnt(0)
1060; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1061; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1062; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1063; GFX9-NEXT:  v_mov_b32_e32 v3, v0
1064; GFX9-NEXT:  v_mov_b32_e32 v4, v0
1065; GFX9-NEXT:  v_mov_b32_e32 v5, v0
1066; GFX9-NEXT:  v_mov_b32_e32 v6, v0
1067; GFX9-NEXT:  v_mov_b32_e32 v7, v0
1068; GFX9-NEXT:  s_setpc_b64 s[30:31]
1069;
1070; GFX10-LABEL: shuffle_v16bf16_rebroadcast:
1071; GFX10:       ; %bb.0: ; %entry
1072; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1073; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1074; GFX10-NEXT:  s_waitcnt vmcnt(0)
1075; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1076; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1077; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1078; GFX10-NEXT:  v_mov_b32_e32 v3, v0
1079; GFX10-NEXT:  v_mov_b32_e32 v4, v0
1080; GFX10-NEXT:  v_mov_b32_e32 v5, v0
1081; GFX10-NEXT:  v_mov_b32_e32 v6, v0
1082; GFX10-NEXT:  v_mov_b32_e32 v7, v0
1083; GFX10-NEXT:  s_setpc_b64 s[30:31]
1084;
1085; GFX11-LABEL: shuffle_v16bf16_rebroadcast:
1086; GFX11:       ; %bb.0: ; %entry
1087; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1088; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1089; GFX11-NEXT:  s_waitcnt vmcnt(0)
1090; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1091; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1092; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1093; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1094; GFX11-NEXT:  v_mov_b32_e32 v3, v0
1095; GFX11-NEXT:  v_mov_b32_e32 v4, v0
1096; GFX11-NEXT:  v_mov_b32_e32 v5, v0
1097; GFX11-NEXT:  v_mov_b32_e32 v6, v0
1098; GFX11-NEXT:  v_mov_b32_e32 v7, v0
1099; GFX11-NEXT:  s_setpc_b64 s[30:31]
1100entry:
1101  %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
1102  %val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1103  ret <16 x bfloat> %val1
1104}
1105
; Splat of element 1 for <32 x bfloat>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose thirty-two lanes all
; select source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 through v15. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1106define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
1107; GFX9-LABEL: shuffle_v32bf16_rebroadcast:
1108; GFX9:       ; %bb.0: ; %entry
1109; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1110; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1111; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1112; GFX9-NEXT:  s_waitcnt vmcnt(0)
1113; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1114; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1115; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1116; GFX9-NEXT:  v_mov_b32_e32 v3, v0
1117; GFX9-NEXT:  v_mov_b32_e32 v4, v0
1118; GFX9-NEXT:  v_mov_b32_e32 v5, v0
1119; GFX9-NEXT:  v_mov_b32_e32 v6, v0
1120; GFX9-NEXT:  v_mov_b32_e32 v7, v0
1121; GFX9-NEXT:  v_mov_b32_e32 v8, v0
1122; GFX9-NEXT:  v_mov_b32_e32 v9, v0
1123; GFX9-NEXT:  v_mov_b32_e32 v10, v0
1124; GFX9-NEXT:  v_mov_b32_e32 v11, v0
1125; GFX9-NEXT:  v_mov_b32_e32 v12, v0
1126; GFX9-NEXT:  v_mov_b32_e32 v13, v0
1127; GFX9-NEXT:  v_mov_b32_e32 v14, v0
1128; GFX9-NEXT:  v_mov_b32_e32 v15, v0
1129; GFX9-NEXT:  s_setpc_b64 s[30:31]
1130;
1131; GFX10-LABEL: shuffle_v32bf16_rebroadcast:
1132; GFX10:       ; %bb.0: ; %entry
1133; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1135; GFX10-NEXT:  s_waitcnt vmcnt(0)
1136; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1137; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1138; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1139; GFX10-NEXT:  v_mov_b32_e32 v3, v0
1140; GFX10-NEXT:  v_mov_b32_e32 v4, v0
1141; GFX10-NEXT:  v_mov_b32_e32 v5, v0
1142; GFX10-NEXT:  v_mov_b32_e32 v6, v0
1143; GFX10-NEXT:  v_mov_b32_e32 v7, v0
1144; GFX10-NEXT:  v_mov_b32_e32 v8, v0
1145; GFX10-NEXT:  v_mov_b32_e32 v9, v0
1146; GFX10-NEXT:  v_mov_b32_e32 v10, v0
1147; GFX10-NEXT:  v_mov_b32_e32 v11, v0
1148; GFX10-NEXT:  v_mov_b32_e32 v12, v0
1149; GFX10-NEXT:  v_mov_b32_e32 v13, v0
1150; GFX10-NEXT:  v_mov_b32_e32 v14, v0
1151; GFX10-NEXT:  v_mov_b32_e32 v15, v0
1152; GFX10-NEXT:  s_setpc_b64 s[30:31]
1153;
1154; GFX11-LABEL: shuffle_v32bf16_rebroadcast:
1155; GFX11:       ; %bb.0: ; %entry
1156; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1157; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1158; GFX11-NEXT:  s_waitcnt vmcnt(0)
1159; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1160; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1161; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1162; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1163; GFX11-NEXT:  v_mov_b32_e32 v3, v0
1164; GFX11-NEXT:  v_mov_b32_e32 v4, v0
1165; GFX11-NEXT:  v_mov_b32_e32 v5, v0
1166; GFX11-NEXT:  v_mov_b32_e32 v6, v0
1167; GFX11-NEXT:  v_mov_b32_e32 v7, v0
1168; GFX11-NEXT:  v_mov_b32_e32 v8, v0
1169; GFX11-NEXT:  v_mov_b32_e32 v9, v0
1170; GFX11-NEXT:  v_mov_b32_e32 v10, v0
1171; GFX11-NEXT:  v_mov_b32_e32 v11, v0
1172; GFX11-NEXT:  v_mov_b32_e32 v12, v0
1173; GFX11-NEXT:  v_mov_b32_e32 v13, v0
1174; GFX11-NEXT:  v_mov_b32_e32 v14, v0
1175; GFX11-NEXT:  v_mov_b32_e32 v15, v0
1176; GFX11-NEXT:  s_setpc_b64 s[30:31]
1177entry:
1178  %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0
1179  %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1180  ret <32 x bfloat> %val1
1181}
1182
; Splat of element 1 for <2 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose two lanes both select
; source lane 1. The autogenerated checks expect one 32-bit load followed by a
; single v_perm_b32 of the loaded dword with itself using selector 0x7060302;
; the gfx900 sequence first materializes that selector in s4, the other two
; targets pass it as a literal operand.
1183define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) {
1184; GFX9-LABEL: shuffle_v2f16_rebroadcast:
1185; GFX9:       ; %bb.0: ; %entry
1186; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1188; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1189; GFX9-NEXT:  s_waitcnt vmcnt(0)
1190; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1191; GFX9-NEXT:  s_setpc_b64 s[30:31]
1192;
1193; GFX10-LABEL: shuffle_v2f16_rebroadcast:
1194; GFX10:       ; %bb.0: ; %entry
1195; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1197; GFX10-NEXT:  s_waitcnt vmcnt(0)
1198; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1199; GFX10-NEXT:  s_setpc_b64 s[30:31]
1200;
1201; GFX11-LABEL: shuffle_v2f16_rebroadcast:
1202; GFX11:       ; %bb.0: ; %entry
1203; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1204; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1205; GFX11-NEXT:  s_waitcnt vmcnt(0)
1206; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1207; GFX11-NEXT:  s_setpc_b64 s[30:31]
1208entry:
1209  %val0 = load <2 x half>, ptr addrspace(1) %arg0
1210  %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> <i32 1, i32 1>
1211  ret <2 x half> %val1
1212}
1213
; Splat of element 1 for <3 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose three lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load into v1, a
; v_perm_b32 with selector 0x7060302 producing v0, and a v_alignbit_b32 by 16
; producing v1.
; NOTE(review): the scalar source of v_alignbit_b32 (s4 on gfx1010, s0 on
; gfx1100) is never written in those expected sequences; presumably it only
; feeds the unused upper half of the second return register - confirm before
; relying on it.
1214define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) {
1215; GFX9-LABEL: shuffle_v3f16_rebroadcast:
1216; GFX9:       ; %bb.0: ; %entry
1217; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1218; GFX9-NEXT:  global_load_dword v1, v[0:1], off
1219; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1220; GFX9-NEXT:  s_waitcnt vmcnt(0)
1221; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
1222; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
1223; GFX9-NEXT:  s_setpc_b64 s[30:31]
1224;
1225; GFX10-LABEL: shuffle_v3f16_rebroadcast:
1226; GFX10:       ; %bb.0: ; %entry
1227; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1228; GFX10-NEXT:  global_load_dword v1, v[0:1], off
1229; GFX10-NEXT:  s_waitcnt vmcnt(0)
1230; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
1231; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
1232; GFX10-NEXT:  s_setpc_b64 s[30:31]
1233;
1234; GFX11-LABEL: shuffle_v3f16_rebroadcast:
1235; GFX11:       ; %bb.0: ; %entry
1236; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1237; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
1238; GFX11-NEXT:  s_waitcnt vmcnt(0)
1239; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
1240; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
1241; GFX11-NEXT:  s_setpc_b64 s[30:31]
1242entry:
1243  %val0 = load <3 x half>, ptr addrspace(1) %arg0
1244  %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> <i32 1, i32 1, i32 1>
1245  ret <3 x half> %val1
1246}
1247
; Splat of element 1 for <4 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose four lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and a copy of v0
; into v1. The gfx1100 sequence additionally has an s_delay_alu between the
; permute and the copy.
1248define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) {
1249; GFX9-LABEL: shuffle_v4f16_rebroadcast:
1250; GFX9:       ; %bb.0: ; %entry
1251; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1252; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1253; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1254; GFX9-NEXT:  s_waitcnt vmcnt(0)
1255; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1256; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1257; GFX9-NEXT:  s_setpc_b64 s[30:31]
1258;
1259; GFX10-LABEL: shuffle_v4f16_rebroadcast:
1260; GFX10:       ; %bb.0: ; %entry
1261; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1262; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1263; GFX10-NEXT:  s_waitcnt vmcnt(0)
1264; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1265; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1266; GFX10-NEXT:  s_setpc_b64 s[30:31]
1267;
1268; GFX11-LABEL: shuffle_v4f16_rebroadcast:
1269; GFX11:       ; %bb.0: ; %entry
1270; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1272; GFX11-NEXT:  s_waitcnt vmcnt(0)
1273; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1274; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1275; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1276; GFX11-NEXT:  s_setpc_b64 s[30:31]
1277entry:
1278  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1279  %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1280  ret <4 x half> %val1
1281}
1282
; Splat of element 1 for <6 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose six lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 and v2. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1283define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) {
1284; GFX9-LABEL: shuffle_v6f16_rebroadcast:
1285; GFX9:       ; %bb.0: ; %entry
1286; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1287; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1288; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1289; GFX9-NEXT:  s_waitcnt vmcnt(0)
1290; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1291; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1292; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1293; GFX9-NEXT:  s_setpc_b64 s[30:31]
1294;
1295; GFX10-LABEL: shuffle_v6f16_rebroadcast:
1296; GFX10:       ; %bb.0: ; %entry
1297; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1298; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1299; GFX10-NEXT:  s_waitcnt vmcnt(0)
1300; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1301; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1302; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1303; GFX10-NEXT:  s_setpc_b64 s[30:31]
1304;
1305; GFX11-LABEL: shuffle_v6f16_rebroadcast:
1306; GFX11:       ; %bb.0: ; %entry
1307; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1309; GFX11-NEXT:  s_waitcnt vmcnt(0)
1310; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1311; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1312; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1313; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1314; GFX11-NEXT:  s_setpc_b64 s[30:31]
1315entry:
1316  %val0 = load <6 x half>, ptr addrspace(1) %arg0
1317  %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1318  ret <6 x half> %val1
1319}
1320
; Splat of element 1 for <8 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose eight lanes all select
; source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 through v3. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1321define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) {
1322; GFX9-LABEL: shuffle_v8f16_rebroadcast:
1323; GFX9:       ; %bb.0: ; %entry
1324; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1325; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1326; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1327; GFX9-NEXT:  s_waitcnt vmcnt(0)
1328; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1329; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1330; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1331; GFX9-NEXT:  v_mov_b32_e32 v3, v0
1332; GFX9-NEXT:  s_setpc_b64 s[30:31]
1333;
1334; GFX10-LABEL: shuffle_v8f16_rebroadcast:
1335; GFX10:       ; %bb.0: ; %entry
1336; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1337; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1338; GFX10-NEXT:  s_waitcnt vmcnt(0)
1339; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1340; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1341; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1342; GFX10-NEXT:  v_mov_b32_e32 v3, v0
1343; GFX10-NEXT:  s_setpc_b64 s[30:31]
1344;
1345; GFX11-LABEL: shuffle_v8f16_rebroadcast:
1346; GFX11:       ; %bb.0: ; %entry
1347; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1348; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1349; GFX11-NEXT:  s_waitcnt vmcnt(0)
1350; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1351; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1352; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1353; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1354; GFX11-NEXT:  v_mov_b32_e32 v3, v0
1355; GFX11-NEXT:  s_setpc_b64 s[30:31]
1356entry:
1357  %val0 = load <8 x half>, ptr addrspace(1) %arg0
1358  %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1359  ret <8 x half> %val1
1360}
1361
; Splat of element 1 for <16 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose sixteen lanes all
; select source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 through v7. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1362define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) {
1363; GFX9-LABEL: shuffle_v16f16_rebroadcast:
1364; GFX9:       ; %bb.0: ; %entry
1365; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1366; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1367; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1368; GFX9-NEXT:  s_waitcnt vmcnt(0)
1369; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1370; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1371; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1372; GFX9-NEXT:  v_mov_b32_e32 v3, v0
1373; GFX9-NEXT:  v_mov_b32_e32 v4, v0
1374; GFX9-NEXT:  v_mov_b32_e32 v5, v0
1375; GFX9-NEXT:  v_mov_b32_e32 v6, v0
1376; GFX9-NEXT:  v_mov_b32_e32 v7, v0
1377; GFX9-NEXT:  s_setpc_b64 s[30:31]
1378;
1379; GFX10-LABEL: shuffle_v16f16_rebroadcast:
1380; GFX10:       ; %bb.0: ; %entry
1381; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1382; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1383; GFX10-NEXT:  s_waitcnt vmcnt(0)
1384; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1385; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1386; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1387; GFX10-NEXT:  v_mov_b32_e32 v3, v0
1388; GFX10-NEXT:  v_mov_b32_e32 v4, v0
1389; GFX10-NEXT:  v_mov_b32_e32 v5, v0
1390; GFX10-NEXT:  v_mov_b32_e32 v6, v0
1391; GFX10-NEXT:  v_mov_b32_e32 v7, v0
1392; GFX10-NEXT:  s_setpc_b64 s[30:31]
1393;
1394; GFX11-LABEL: shuffle_v16f16_rebroadcast:
1395; GFX11:       ; %bb.0: ; %entry
1396; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1397; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1398; GFX11-NEXT:  s_waitcnt vmcnt(0)
1399; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1400; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1401; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1402; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1403; GFX11-NEXT:  v_mov_b32_e32 v3, v0
1404; GFX11-NEXT:  v_mov_b32_e32 v4, v0
1405; GFX11-NEXT:  v_mov_b32_e32 v5, v0
1406; GFX11-NEXT:  v_mov_b32_e32 v6, v0
1407; GFX11-NEXT:  v_mov_b32_e32 v7, v0
1408; GFX11-NEXT:  s_setpc_b64 s[30:31]
1409entry:
1410  %val0 = load <16 x half>, ptr addrspace(1) %arg0
1411  %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1412  ret <16 x half> %val1
1413}
1414
; Splat of element 1 for <32 x half>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose thirty-two lanes all
; select source lane 1. The autogenerated checks expect one 32-bit load, a
; v_perm_b32 with selector 0x7060302 on the loaded dword, and copies of v0
; into v1 through v15. The gfx1100 sequence additionally has an s_delay_alu
; between the permute and the first copy.
1415define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) {
1416; GFX9-LABEL: shuffle_v32f16_rebroadcast:
1417; GFX9:       ; %bb.0: ; %entry
1418; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1419; GFX9-NEXT:  global_load_dword v0, v[0:1], off
1420; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
1421; GFX9-NEXT:  s_waitcnt vmcnt(0)
1422; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
1423; GFX9-NEXT:  v_mov_b32_e32 v1, v0
1424; GFX9-NEXT:  v_mov_b32_e32 v2, v0
1425; GFX9-NEXT:  v_mov_b32_e32 v3, v0
1426; GFX9-NEXT:  v_mov_b32_e32 v4, v0
1427; GFX9-NEXT:  v_mov_b32_e32 v5, v0
1428; GFX9-NEXT:  v_mov_b32_e32 v6, v0
1429; GFX9-NEXT:  v_mov_b32_e32 v7, v0
1430; GFX9-NEXT:  v_mov_b32_e32 v8, v0
1431; GFX9-NEXT:  v_mov_b32_e32 v9, v0
1432; GFX9-NEXT:  v_mov_b32_e32 v10, v0
1433; GFX9-NEXT:  v_mov_b32_e32 v11, v0
1434; GFX9-NEXT:  v_mov_b32_e32 v12, v0
1435; GFX9-NEXT:  v_mov_b32_e32 v13, v0
1436; GFX9-NEXT:  v_mov_b32_e32 v14, v0
1437; GFX9-NEXT:  v_mov_b32_e32 v15, v0
1438; GFX9-NEXT:  s_setpc_b64 s[30:31]
1439;
1440; GFX10-LABEL: shuffle_v32f16_rebroadcast:
1441; GFX10:       ; %bb.0: ; %entry
1442; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443; GFX10-NEXT:  global_load_dword v0, v[0:1], off
1444; GFX10-NEXT:  s_waitcnt vmcnt(0)
1445; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1446; GFX10-NEXT:  v_mov_b32_e32 v1, v0
1447; GFX10-NEXT:  v_mov_b32_e32 v2, v0
1448; GFX10-NEXT:  v_mov_b32_e32 v3, v0
1449; GFX10-NEXT:  v_mov_b32_e32 v4, v0
1450; GFX10-NEXT:  v_mov_b32_e32 v5, v0
1451; GFX10-NEXT:  v_mov_b32_e32 v6, v0
1452; GFX10-NEXT:  v_mov_b32_e32 v7, v0
1453; GFX10-NEXT:  v_mov_b32_e32 v8, v0
1454; GFX10-NEXT:  v_mov_b32_e32 v9, v0
1455; GFX10-NEXT:  v_mov_b32_e32 v10, v0
1456; GFX10-NEXT:  v_mov_b32_e32 v11, v0
1457; GFX10-NEXT:  v_mov_b32_e32 v12, v0
1458; GFX10-NEXT:  v_mov_b32_e32 v13, v0
1459; GFX10-NEXT:  v_mov_b32_e32 v14, v0
1460; GFX10-NEXT:  v_mov_b32_e32 v15, v0
1461; GFX10-NEXT:  s_setpc_b64 s[30:31]
1462;
1463; GFX11-LABEL: shuffle_v32f16_rebroadcast:
1464; GFX11:       ; %bb.0: ; %entry
1465; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
1467; GFX11-NEXT:  s_waitcnt vmcnt(0)
1468; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
1469; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
1470; GFX11-NEXT:  v_mov_b32_e32 v1, v0
1471; GFX11-NEXT:  v_mov_b32_e32 v2, v0
1472; GFX11-NEXT:  v_mov_b32_e32 v3, v0
1473; GFX11-NEXT:  v_mov_b32_e32 v4, v0
1474; GFX11-NEXT:  v_mov_b32_e32 v5, v0
1475; GFX11-NEXT:  v_mov_b32_e32 v6, v0
1476; GFX11-NEXT:  v_mov_b32_e32 v7, v0
1477; GFX11-NEXT:  v_mov_b32_e32 v8, v0
1478; GFX11-NEXT:  v_mov_b32_e32 v9, v0
1479; GFX11-NEXT:  v_mov_b32_e32 v10, v0
1480; GFX11-NEXT:  v_mov_b32_e32 v11, v0
1481; GFX11-NEXT:  v_mov_b32_e32 v12, v0
1482; GFX11-NEXT:  v_mov_b32_e32 v13, v0
1483; GFX11-NEXT:  v_mov_b32_e32 v14, v0
1484; GFX11-NEXT:  v_mov_b32_e32 v15, v0
1485; GFX11-NEXT:  s_setpc_b64 s[30:31]
1486entry:
1487  %val0 = load <32 x half>, ptr addrspace(1) %arg0
1488  %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1489  ret <32 x half> %val1
1490}
1491
; Splat of element 1 for <2 x float>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose two lanes both select
; source lane 1. The autogenerated checks expect a 64-bit load of both
; elements into v[0:1] followed by a single copy of v1 into v0, leaving v1
; as loaded.
1492define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) {
1493; GFX9-LABEL: shuffle_v2f32_rebroadcast:
1494; GFX9:       ; %bb.0: ; %entry
1495; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1496; GFX9-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
1497; GFX9-NEXT:  s_waitcnt vmcnt(0)
1498; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1499; GFX9-NEXT:  s_setpc_b64 s[30:31]
1500;
1501; GFX10-LABEL: shuffle_v2f32_rebroadcast:
1502; GFX10:       ; %bb.0: ; %entry
1503; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504; GFX10-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
1505; GFX10-NEXT:  s_waitcnt vmcnt(0)
1506; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1507; GFX10-NEXT:  s_setpc_b64 s[30:31]
1508;
1509; GFX11-LABEL: shuffle_v2f32_rebroadcast:
1510; GFX11:       ; %bb.0: ; %entry
1511; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1512; GFX11-NEXT:  global_load_b64 v[0:1], v[0:1], off
1513; GFX11-NEXT:  s_waitcnt vmcnt(0)
1514; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1515; GFX11-NEXT:  s_setpc_b64 s[30:31]
1516entry:
1517  %val0 = load <2 x float>, ptr addrspace(1) %arg0
1518  %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> <i32 1, i32 1>
1519  ret <2 x float> %val1
1520}
1521
; Splat of element 1 for <3 x float>: loads the vector through the
; addrspace(1) pointer %arg0 and returns a vector whose three lanes all select
; source lane 1. The autogenerated checks expect a 96-bit load of all three
; elements into v[0:2] followed by copies of v1 into v0 and v2, leaving v1
; as loaded.
1522define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) {
1523; GFX9-LABEL: shuffle_v3f32_rebroadcast:
1524; GFX9:       ; %bb.0: ; %entry
1525; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1526; GFX9-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
1527; GFX9-NEXT:  s_waitcnt vmcnt(0)
1528; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1529; GFX9-NEXT:  v_mov_b32_e32 v2, v1
1530; GFX9-NEXT:  s_setpc_b64 s[30:31]
1531;
1532; GFX10-LABEL: shuffle_v3f32_rebroadcast:
1533; GFX10:       ; %bb.0: ; %entry
1534; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1535; GFX10-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
1536; GFX10-NEXT:  s_waitcnt vmcnt(0)
1537; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1538; GFX10-NEXT:  v_mov_b32_e32 v2, v1
1539; GFX10-NEXT:  s_setpc_b64 s[30:31]
1540;
1541; GFX11-LABEL: shuffle_v3f32_rebroadcast:
1542; GFX11:       ; %bb.0: ; %entry
1543; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1544; GFX11-NEXT:  global_load_b96 v[0:2], v[0:1], off
1545; GFX11-NEXT:  s_waitcnt vmcnt(0)
1546; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1547; GFX11-NEXT:  v_mov_b32_e32 v2, v1
1548; GFX11-NEXT:  s_setpc_b64 s[30:31]
1549entry:
1550  %val0 = load <3 x float>, ptr addrspace(1) %arg0
1551  %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1>
1552  ret <3 x float> %val1
1553}
1554
; Broadcast test for <4 x float>: the vector is loaded through %arg0 and all
; four result lanes select element 1. The GFX9/GFX10/GFX11 checks expect a
; 128-bit load into v[0:3] followed by copies of v1 into v0, v2 and v3.
1555define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
1556; GFX9-LABEL: shuffle_v4f32_rebroadcast:
1557; GFX9:       ; %bb.0: ; %entry
1558; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1559; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1560; GFX9-NEXT:  s_waitcnt vmcnt(0)
1561; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1562; GFX9-NEXT:  v_mov_b32_e32 v2, v1
1563; GFX9-NEXT:  v_mov_b32_e32 v3, v1
1564; GFX9-NEXT:  s_setpc_b64 s[30:31]
1565;
1566; GFX10-LABEL: shuffle_v4f32_rebroadcast:
1567; GFX10:       ; %bb.0: ; %entry
1568; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1569; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1570; GFX10-NEXT:  s_waitcnt vmcnt(0)
1571; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1572; GFX10-NEXT:  v_mov_b32_e32 v2, v1
1573; GFX10-NEXT:  v_mov_b32_e32 v3, v1
1574; GFX10-NEXT:  s_setpc_b64 s[30:31]
1575;
1576; GFX11-LABEL: shuffle_v4f32_rebroadcast:
1577; GFX11:       ; %bb.0: ; %entry
1578; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1579; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
1580; GFX11-NEXT:  s_waitcnt vmcnt(0)
1581; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1582; GFX11-NEXT:  v_mov_b32_e32 v2, v1
1583; GFX11-NEXT:  v_mov_b32_e32 v3, v1
1584; GFX11-NEXT:  s_setpc_b64 s[30:31]
1585entry:
1586  %val0 = load <4 x float>, ptr addrspace(1) %arg0
  ; All-ones mask: every result lane reads element 1 of %val0.
1587  %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1588  ret <4 x float> %val1
1589}
1590
; Broadcast test for <6 x float>: the vector is loaded through %arg0 and all
; six result lanes select element 1. Although the IR loads six floats, the
; GFX9/GFX10/GFX11 checks expect only a single 128-bit load into v[0:3],
; followed by copies of v1 into v0 and v2-v5.
1591define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
1592; GFX9-LABEL: shuffle_v6f32_rebroadcast:
1593; GFX9:       ; %bb.0: ; %entry
1594; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1595; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1596; GFX9-NEXT:  s_waitcnt vmcnt(0)
1597; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1598; GFX9-NEXT:  v_mov_b32_e32 v2, v1
1599; GFX9-NEXT:  v_mov_b32_e32 v3, v1
1600; GFX9-NEXT:  v_mov_b32_e32 v4, v1
1601; GFX9-NEXT:  v_mov_b32_e32 v5, v1
1602; GFX9-NEXT:  s_setpc_b64 s[30:31]
1603;
1604; GFX10-LABEL: shuffle_v6f32_rebroadcast:
1605; GFX10:       ; %bb.0: ; %entry
1606; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1607; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1608; GFX10-NEXT:  s_waitcnt vmcnt(0)
1609; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1610; GFX10-NEXT:  v_mov_b32_e32 v2, v1
1611; GFX10-NEXT:  v_mov_b32_e32 v3, v1
1612; GFX10-NEXT:  v_mov_b32_e32 v4, v1
1613; GFX10-NEXT:  v_mov_b32_e32 v5, v1
1614; GFX10-NEXT:  s_setpc_b64 s[30:31]
1615;
1616; GFX11-LABEL: shuffle_v6f32_rebroadcast:
1617; GFX11:       ; %bb.0: ; %entry
1618; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1619; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
1620; GFX11-NEXT:  s_waitcnt vmcnt(0)
1621; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1622; GFX11-NEXT:  v_mov_b32_e32 v2, v1
1623; GFX11-NEXT:  v_mov_b32_e32 v3, v1
1624; GFX11-NEXT:  v_mov_b32_e32 v4, v1
1625; GFX11-NEXT:  v_mov_b32_e32 v5, v1
1626; GFX11-NEXT:  s_setpc_b64 s[30:31]
1627entry:
1628  %val0 = load <6 x float>, ptr addrspace(1) %arg0
  ; All-ones mask: every result lane reads element 1 of %val0.
1629  %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1630  ret <6 x float> %val1
1631}
1632
; Broadcast test for <8 x float>: the vector is loaded through %arg0 and all
; eight result lanes select element 1. Although the IR loads eight floats, the
; GFX9/GFX10/GFX11 checks expect only a single 128-bit load into v[0:3],
; followed by copies of v1 into v0 and v2-v7.
1633define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
1634; GFX9-LABEL: shuffle_v8f32_rebroadcast:
1635; GFX9:       ; %bb.0: ; %entry
1636; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1637; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1638; GFX9-NEXT:  s_waitcnt vmcnt(0)
1639; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1640; GFX9-NEXT:  v_mov_b32_e32 v2, v1
1641; GFX9-NEXT:  v_mov_b32_e32 v3, v1
1642; GFX9-NEXT:  v_mov_b32_e32 v4, v1
1643; GFX9-NEXT:  v_mov_b32_e32 v5, v1
1644; GFX9-NEXT:  v_mov_b32_e32 v6, v1
1645; GFX9-NEXT:  v_mov_b32_e32 v7, v1
1646; GFX9-NEXT:  s_setpc_b64 s[30:31]
1647;
1648; GFX10-LABEL: shuffle_v8f32_rebroadcast:
1649; GFX10:       ; %bb.0: ; %entry
1650; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1651; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1652; GFX10-NEXT:  s_waitcnt vmcnt(0)
1653; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1654; GFX10-NEXT:  v_mov_b32_e32 v2, v1
1655; GFX10-NEXT:  v_mov_b32_e32 v3, v1
1656; GFX10-NEXT:  v_mov_b32_e32 v4, v1
1657; GFX10-NEXT:  v_mov_b32_e32 v5, v1
1658; GFX10-NEXT:  v_mov_b32_e32 v6, v1
1659; GFX10-NEXT:  v_mov_b32_e32 v7, v1
1660; GFX10-NEXT:  s_setpc_b64 s[30:31]
1661;
1662; GFX11-LABEL: shuffle_v8f32_rebroadcast:
1663; GFX11:       ; %bb.0: ; %entry
1664; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
1666; GFX11-NEXT:  s_waitcnt vmcnt(0)
1667; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1668; GFX11-NEXT:  v_mov_b32_e32 v2, v1
1669; GFX11-NEXT:  v_mov_b32_e32 v3, v1
1670; GFX11-NEXT:  v_mov_b32_e32 v4, v1
1671; GFX11-NEXT:  v_mov_b32_e32 v5, v1
1672; GFX11-NEXT:  v_mov_b32_e32 v6, v1
1673; GFX11-NEXT:  v_mov_b32_e32 v7, v1
1674; GFX11-NEXT:  s_setpc_b64 s[30:31]
1675entry:
1676  %val0 = load <8 x float>, ptr addrspace(1) %arg0
  ; All-ones mask: every result lane reads element 1 of %val0.
1677  %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1678  ret <8 x float> %val1
1679}
1680
; Broadcast test for <16 x float>: the vector is loaded through %arg0 and all
; sixteen result lanes select element 1. Although the IR loads sixteen floats,
; the GFX9/GFX10/GFX11 checks expect only a single 128-bit load into v[0:3],
; followed by copies of v1 into v0 and v2-v15.
1681define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
1682; GFX9-LABEL: shuffle_v16f32_rebroadcast:
1683; GFX9:       ; %bb.0: ; %entry
1684; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1685; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1686; GFX9-NEXT:  s_waitcnt vmcnt(0)
1687; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1688; GFX9-NEXT:  v_mov_b32_e32 v2, v1
1689; GFX9-NEXT:  v_mov_b32_e32 v3, v1
1690; GFX9-NEXT:  v_mov_b32_e32 v4, v1
1691; GFX9-NEXT:  v_mov_b32_e32 v5, v1
1692; GFX9-NEXT:  v_mov_b32_e32 v6, v1
1693; GFX9-NEXT:  v_mov_b32_e32 v7, v1
1694; GFX9-NEXT:  v_mov_b32_e32 v8, v1
1695; GFX9-NEXT:  v_mov_b32_e32 v9, v1
1696; GFX9-NEXT:  v_mov_b32_e32 v10, v1
1697; GFX9-NEXT:  v_mov_b32_e32 v11, v1
1698; GFX9-NEXT:  v_mov_b32_e32 v12, v1
1699; GFX9-NEXT:  v_mov_b32_e32 v13, v1
1700; GFX9-NEXT:  v_mov_b32_e32 v14, v1
1701; GFX9-NEXT:  v_mov_b32_e32 v15, v1
1702; GFX9-NEXT:  s_setpc_b64 s[30:31]
1703;
1704; GFX10-LABEL: shuffle_v16f32_rebroadcast:
1705; GFX10:       ; %bb.0: ; %entry
1706; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1707; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1708; GFX10-NEXT:  s_waitcnt vmcnt(0)
1709; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1710; GFX10-NEXT:  v_mov_b32_e32 v2, v1
1711; GFX10-NEXT:  v_mov_b32_e32 v3, v1
1712; GFX10-NEXT:  v_mov_b32_e32 v4, v1
1713; GFX10-NEXT:  v_mov_b32_e32 v5, v1
1714; GFX10-NEXT:  v_mov_b32_e32 v6, v1
1715; GFX10-NEXT:  v_mov_b32_e32 v7, v1
1716; GFX10-NEXT:  v_mov_b32_e32 v8, v1
1717; GFX10-NEXT:  v_mov_b32_e32 v9, v1
1718; GFX10-NEXT:  v_mov_b32_e32 v10, v1
1719; GFX10-NEXT:  v_mov_b32_e32 v11, v1
1720; GFX10-NEXT:  v_mov_b32_e32 v12, v1
1721; GFX10-NEXT:  v_mov_b32_e32 v13, v1
1722; GFX10-NEXT:  v_mov_b32_e32 v14, v1
1723; GFX10-NEXT:  v_mov_b32_e32 v15, v1
1724; GFX10-NEXT:  s_setpc_b64 s[30:31]
1725;
1726; GFX11-LABEL: shuffle_v16f32_rebroadcast:
1727; GFX11:       ; %bb.0: ; %entry
1728; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1729; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
1730; GFX11-NEXT:  s_waitcnt vmcnt(0)
1731; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1732; GFX11-NEXT:  v_mov_b32_e32 v2, v1
1733; GFX11-NEXT:  v_mov_b32_e32 v3, v1
1734; GFX11-NEXT:  v_mov_b32_e32 v4, v1
1735; GFX11-NEXT:  v_mov_b32_e32 v5, v1
1736; GFX11-NEXT:  v_mov_b32_e32 v6, v1
1737; GFX11-NEXT:  v_mov_b32_e32 v7, v1
1738; GFX11-NEXT:  v_mov_b32_e32 v8, v1
1739; GFX11-NEXT:  v_mov_b32_e32 v9, v1
1740; GFX11-NEXT:  v_mov_b32_e32 v10, v1
1741; GFX11-NEXT:  v_mov_b32_e32 v11, v1
1742; GFX11-NEXT:  v_mov_b32_e32 v12, v1
1743; GFX11-NEXT:  v_mov_b32_e32 v13, v1
1744; GFX11-NEXT:  v_mov_b32_e32 v14, v1
1745; GFX11-NEXT:  v_mov_b32_e32 v15, v1
1746; GFX11-NEXT:  s_setpc_b64 s[30:31]
1747entry:
1748  %val0 = load <16 x float>, ptr addrspace(1) %arg0
  ; All-ones mask: every result lane reads element 1 of %val0.
1749  %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1750  ret <16 x float> %val1
1751}
1752
; Broadcast test for <32 x float>: the vector is loaded through %arg0 and all
; thirty-two result lanes select element 1. Although the IR loads thirty-two
; floats, the GFX9/GFX10/GFX11 checks expect only a single 128-bit load into
; v[0:3], followed by copies of v1 into v0 and v2-v31.
1753define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
1754; GFX9-LABEL: shuffle_v32f32_rebroadcast:
1755; GFX9:       ; %bb.0: ; %entry
1756; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1757; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1758; GFX9-NEXT:  s_waitcnt vmcnt(0)
1759; GFX9-NEXT:  v_mov_b32_e32 v0, v1
1760; GFX9-NEXT:  v_mov_b32_e32 v2, v1
1761; GFX9-NEXT:  v_mov_b32_e32 v3, v1
1762; GFX9-NEXT:  v_mov_b32_e32 v4, v1
1763; GFX9-NEXT:  v_mov_b32_e32 v5, v1
1764; GFX9-NEXT:  v_mov_b32_e32 v6, v1
1765; GFX9-NEXT:  v_mov_b32_e32 v7, v1
1766; GFX9-NEXT:  v_mov_b32_e32 v8, v1
1767; GFX9-NEXT:  v_mov_b32_e32 v9, v1
1768; GFX9-NEXT:  v_mov_b32_e32 v10, v1
1769; GFX9-NEXT:  v_mov_b32_e32 v11, v1
1770; GFX9-NEXT:  v_mov_b32_e32 v12, v1
1771; GFX9-NEXT:  v_mov_b32_e32 v13, v1
1772; GFX9-NEXT:  v_mov_b32_e32 v14, v1
1773; GFX9-NEXT:  v_mov_b32_e32 v15, v1
1774; GFX9-NEXT:  v_mov_b32_e32 v16, v1
1775; GFX9-NEXT:  v_mov_b32_e32 v17, v1
1776; GFX9-NEXT:  v_mov_b32_e32 v18, v1
1777; GFX9-NEXT:  v_mov_b32_e32 v19, v1
1778; GFX9-NEXT:  v_mov_b32_e32 v20, v1
1779; GFX9-NEXT:  v_mov_b32_e32 v21, v1
1780; GFX9-NEXT:  v_mov_b32_e32 v22, v1
1781; GFX9-NEXT:  v_mov_b32_e32 v23, v1
1782; GFX9-NEXT:  v_mov_b32_e32 v24, v1
1783; GFX9-NEXT:  v_mov_b32_e32 v25, v1
1784; GFX9-NEXT:  v_mov_b32_e32 v26, v1
1785; GFX9-NEXT:  v_mov_b32_e32 v27, v1
1786; GFX9-NEXT:  v_mov_b32_e32 v28, v1
1787; GFX9-NEXT:  v_mov_b32_e32 v29, v1
1788; GFX9-NEXT:  v_mov_b32_e32 v30, v1
1789; GFX9-NEXT:  v_mov_b32_e32 v31, v1
1790; GFX9-NEXT:  s_setpc_b64 s[30:31]
1791;
1792; GFX10-LABEL: shuffle_v32f32_rebroadcast:
1793; GFX10:       ; %bb.0: ; %entry
1794; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1795; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
1796; GFX10-NEXT:  s_waitcnt vmcnt(0)
1797; GFX10-NEXT:  v_mov_b32_e32 v0, v1
1798; GFX10-NEXT:  v_mov_b32_e32 v2, v1
1799; GFX10-NEXT:  v_mov_b32_e32 v3, v1
1800; GFX10-NEXT:  v_mov_b32_e32 v4, v1
1801; GFX10-NEXT:  v_mov_b32_e32 v5, v1
1802; GFX10-NEXT:  v_mov_b32_e32 v6, v1
1803; GFX10-NEXT:  v_mov_b32_e32 v7, v1
1804; GFX10-NEXT:  v_mov_b32_e32 v8, v1
1805; GFX10-NEXT:  v_mov_b32_e32 v9, v1
1806; GFX10-NEXT:  v_mov_b32_e32 v10, v1
1807; GFX10-NEXT:  v_mov_b32_e32 v11, v1
1808; GFX10-NEXT:  v_mov_b32_e32 v12, v1
1809; GFX10-NEXT:  v_mov_b32_e32 v13, v1
1810; GFX10-NEXT:  v_mov_b32_e32 v14, v1
1811; GFX10-NEXT:  v_mov_b32_e32 v15, v1
1812; GFX10-NEXT:  v_mov_b32_e32 v16, v1
1813; GFX10-NEXT:  v_mov_b32_e32 v17, v1
1814; GFX10-NEXT:  v_mov_b32_e32 v18, v1
1815; GFX10-NEXT:  v_mov_b32_e32 v19, v1
1816; GFX10-NEXT:  v_mov_b32_e32 v20, v1
1817; GFX10-NEXT:  v_mov_b32_e32 v21, v1
1818; GFX10-NEXT:  v_mov_b32_e32 v22, v1
1819; GFX10-NEXT:  v_mov_b32_e32 v23, v1
1820; GFX10-NEXT:  v_mov_b32_e32 v24, v1
1821; GFX10-NEXT:  v_mov_b32_e32 v25, v1
1822; GFX10-NEXT:  v_mov_b32_e32 v26, v1
1823; GFX10-NEXT:  v_mov_b32_e32 v27, v1
1824; GFX10-NEXT:  v_mov_b32_e32 v28, v1
1825; GFX10-NEXT:  v_mov_b32_e32 v29, v1
1826; GFX10-NEXT:  v_mov_b32_e32 v30, v1
1827; GFX10-NEXT:  v_mov_b32_e32 v31, v1
1828; GFX10-NEXT:  s_setpc_b64 s[30:31]
1829;
1830; GFX11-LABEL: shuffle_v32f32_rebroadcast:
1831; GFX11:       ; %bb.0: ; %entry
1832; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1833; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
1834; GFX11-NEXT:  s_waitcnt vmcnt(0)
1835; GFX11-NEXT:  v_mov_b32_e32 v0, v1
1836; GFX11-NEXT:  v_mov_b32_e32 v2, v1
1837; GFX11-NEXT:  v_mov_b32_e32 v3, v1
1838; GFX11-NEXT:  v_mov_b32_e32 v4, v1
1839; GFX11-NEXT:  v_mov_b32_e32 v5, v1
1840; GFX11-NEXT:  v_mov_b32_e32 v6, v1
1841; GFX11-NEXT:  v_mov_b32_e32 v7, v1
1842; GFX11-NEXT:  v_mov_b32_e32 v8, v1
1843; GFX11-NEXT:  v_mov_b32_e32 v9, v1
1844; GFX11-NEXT:  v_mov_b32_e32 v10, v1
1845; GFX11-NEXT:  v_mov_b32_e32 v11, v1
1846; GFX11-NEXT:  v_mov_b32_e32 v12, v1
1847; GFX11-NEXT:  v_mov_b32_e32 v13, v1
1848; GFX11-NEXT:  v_mov_b32_e32 v14, v1
1849; GFX11-NEXT:  v_mov_b32_e32 v15, v1
1850; GFX11-NEXT:  v_mov_b32_e32 v16, v1
1851; GFX11-NEXT:  v_mov_b32_e32 v17, v1
1852; GFX11-NEXT:  v_mov_b32_e32 v18, v1
1853; GFX11-NEXT:  v_mov_b32_e32 v19, v1
1854; GFX11-NEXT:  v_mov_b32_e32 v20, v1
1855; GFX11-NEXT:  v_mov_b32_e32 v21, v1
1856; GFX11-NEXT:  v_mov_b32_e32 v22, v1
1857; GFX11-NEXT:  v_mov_b32_e32 v23, v1
1858; GFX11-NEXT:  v_mov_b32_e32 v24, v1
1859; GFX11-NEXT:  v_mov_b32_e32 v25, v1
1860; GFX11-NEXT:  v_mov_b32_e32 v26, v1
1861; GFX11-NEXT:  v_mov_b32_e32 v27, v1
1862; GFX11-NEXT:  v_mov_b32_e32 v28, v1
1863; GFX11-NEXT:  v_mov_b32_e32 v29, v1
1864; GFX11-NEXT:  v_mov_b32_e32 v30, v1
1865; GFX11-NEXT:  v_mov_b32_e32 v31, v1
1866; GFX11-NEXT:  s_setpc_b64 s[30:31]
1867entry:
1868  %val0 = load <32 x float>, ptr addrspace(1) %arg0
  ; All-ones mask: every result lane reads element 1 of %val0.
1869  %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1870  ret <32 x float> %val1
1871}
1872