xref: /llvm-project/llvm/test/CodeGen/AMDGPU/permute_i8.ll (revision bfd9bc274586b0261e16e22ac50d50586a0152e2)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx908  -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
4
; Mask <6,7,6,6>: every result byte comes from %vec1, so the expected code has
; a single load and one v_perm_b32 with selector 0x6060706 (materialized in an
; SGPR on GFX9, used as a literal on GFX10).
define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle6766:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6060706
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle6766:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x6060706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
31
; Mask <3,7,4,4>: one byte from %vec0 and three from %vec1. Both vectors are
; loaded and merged by a single v_perm_b32 with selector 0x307.
define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle3744:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x307
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle3744:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_movk_i32 s4, 0x307
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 7, i32 4, i32 4>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
60
; Mask <4,4,4,5>: every result byte comes from %vec1, so only one load is
; expected, followed by a single v_perm_b32 with selector 0x5040404.
define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle4445:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040404
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle4445:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040404
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
87
; Mask <0,1,0,1>: every result byte comes from %vec0, so only one load is
; expected, followed by a single v_perm_b32 with selector 0x5040504.
define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle0101:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
114
; Mask <1,0,0,4>: three bytes from %vec0 and one from %vec1. Both vectors are
; loaded and merged by a single v_perm_b32 with selector 0x40405.
define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle1004:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x40405
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle1004:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x40405
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
143
144
145
; Same shuffle-to-perm pattern through addrspace(0) pointers: the checks expect
; flat_load/flat_store around a single v_perm_b32. Mask <7,5,3,3> takes two
; bytes from %vec1 and two from %vec0; selector 0x3030507.
define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
; GFX10-LABEL: shuffle7533:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_load_dword v6, v[0:1]
; GFX10-NEXT:    flat_load_dword v7, v[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3030507
; GFX10-NEXT:    flat_store_dword v[4:5], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle7533:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dword v6, v[0:1]
; GFX9-NEXT:    flat_load_dword v7, v[2:3]
; GFX9-NEXT:    s_mov_b32 s4, 0x3030507
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    flat_store_dword v[4:5], v0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
  store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
  ret void
}
175
; addrspace(0) variant where mask <7,7,6,7> reads only %vec1: one flat load,
; one v_perm_b32 with selector 0x7060707, one flat store.
define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
; GFX10-LABEL: shuffle7767:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    flat_load_dword v0, v[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060707
; GFX10-NEXT:    flat_store_dword v[4:5], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle7767:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dword v0, v[2:3]
; GFX9-NEXT:    s_mov_b32 s4, 0x7060707
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    flat_store_dword v[4:5], v0
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
  ret void
}
203
; addrspace(3) variant: the checks expect ds_read_b32/ds_write_b32 around a
; single v_perm_b32. Mask <0,5,5,4> takes one byte from %vec0 and three from
; %vec1; selector 0x10104.
define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
; GFX10-LABEL: shuffle0554:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_b32 v0, v0
; GFX10-NEXT:    ds_read_b32 v1, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x10104
; GFX10-NEXT:    ds_write_b32 v2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle0554:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b32 v0, v0
; GFX9-NEXT:    ds_read_b32 v1, v1
; GFX9-NEXT:    s_mov_b32 s4, 0x10104
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT:    ds_write_b32 v2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4>
  store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
  ret void
}
233
; addrspace(3) variant with mask <2,1,2,7>: three bytes from %vec0 and one from
; %vec1, merged by a single v_perm_b32 with selector 0x3060506.
define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
; GFX10-LABEL: shuffle2127:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    ds_read_b32 v0, v0
; GFX10-NEXT:    ds_read_b32 v1, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3060506
; GFX10-NEXT:    ds_write_b32 v2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle2127:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b32 v0, v0
; GFX9-NEXT:    ds_read_b32 v1, v1
; GFX9-NEXT:    s_mov_b32 s4, 0x3060506
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT:    ds_write_b32 v2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4
  ret void
}
263
; addrspace(5) variant: the checks expect buffer_load/buffer_store with `offen`
; addressing around a single v_perm_b32. Mask <5,0,4,7> takes three bytes from
; %vec1 and one from %vec0; selector 0x7040005.
define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) {
; GFX10-LABEL: shuffle5047:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX10-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v4, v3, 0x7040005
; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle5047:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX9-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_mov_b32 s4, 0x7040005
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v4, v3, s4
; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4
  ret void
}
293
; Mask <3,5,4,6>: one byte from %vec0 and three from %vec1, merged by a single
; v_perm_b32 with selector 0x2000107.
define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle3546:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2000107
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle3546:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x2000107
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
322
323
; Second shuffle operand is undef, so mask indices >= 4 select undefined bytes.
; Mask <7,3,3,0> leaves lane 0 undefined; the remaining lanes come from %vec0
; via a single v_perm_b32 with selector 0x4070706.
define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle7330ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x4070706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle7330ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x4070706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
349
; Undef second operand with mask <5,3,4,1>: only lanes 1 and 3 are defined
; (bytes 3 and 1 of %vec0). The checks expect this to become a 16-bit rotate,
; v_alignbit_b32 v0, v0, v0, 16, instead of a v_perm_b32.
define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle5341ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle5341ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 5, i32 3, i32 4, i32 1>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
374
; Undef second operand with mask <6,1,0,6>: lanes 1 and 2 are bytes 1 and 0 of
; %vec0, lanes 0 and 3 are undefined. Expected selector: 0x5040504.
define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle6106ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle6106ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
400
401
; Undef second operand with mask <4,3,2,7>: lanes 1 and 2 are bytes 3 and 2 of
; %vec0, lanes 0 and 3 are undefined. Expected selector: 0x7060706.
define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle4327ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle4327ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 4, i32 3, i32 2, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
427
; Undef second operand with mask <3,2,6,3>: lanes 0, 1 and 3 are bytes 3, 2 and
; 3 of %vec0, lane 2 is undefined. Expected selector: 0x7060607.
define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle3263ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060607
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle3263ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060607
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 6, i32 3>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
453
; Undef second operand with mask <2,7,6,3>: lanes 0 and 3 are bytes 2 and 3 of
; %vec0, lanes 1 and 2 are undefined. Expected selector: 0x7060706.
define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle2763ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle2763ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 2, i32 7, i32 6, i32 3>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
479
; Undef second operand with mask <1,3,2,7>: lanes 0-2 are bytes 1, 3 and 2 of
; %vec0, lane 3 is undefined. Expected selector: 0x7060705.
define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle1327ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060705
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle1327ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x7060705
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 7>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
505
; Undef second operand with mask <0,6,0,5>: lanes 0 and 2 are byte 0 of %vec0,
; lanes 1 and 3 are undefined. Expected selector: 0x5040504.
define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: shuffle0605ud2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shuffle0605ud2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5>
  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4
  ret void
}
531
; Shuffle (undef second operand, mask <1,3,5,4>) followed by an insertelement
; that overwrites lane 1 with %elt. The expected output contains no v_perm_b32:
; the result is assembled with a 16-bit shift, v_or_b32_sdwa and a 0xffff mask.
; NOTE: %vec1 is loaded but never used; the checks show only one load.
define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: insertUsesOr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: insertUsesOr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ret void
}
562
; Shuffle (undef second operand, mask <7,0,6,3>) whose result feeds a vector
; add with %vec1. The expected output contains no v_perm_b32: the add is done
; per byte (SDWA adds on GFX9, shifts plus v_add_nc_u16 on GFX10) and the bytes
; are reassembled with v_or.
define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: addUsesOr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
; GFX10-NEXT:    v_add_nc_u16 v2, v2, v3
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_add_nc_u16 v1, v4, v1
; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: addUsesOr:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 0, i32 6, i32 3>
  %added = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %added, ptr addrspace(1) %out0
  ret void
}
607
608
; Kernel entry point shuffling two <8 x i8> vectors with mask
; <1,8,5,12,0,14,2,9>. Both inputs are fetched with scalar loads
; (s_load_dwordx2) and the expected output builds the result entirely from
; scalar bit operations (s_bfe/s_lshl/s_and/s_or); no v_perm_b32 is expected.
define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 {
; GFX10-LABEL: shuffle8i8:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_bfe_u32 s2, s5, 0x80008
; GFX10-NEXT:    s_lshl_b32 s1, s9, 8
; GFX10-NEXT:    s_bfe_u32 s9, s9, 0x100010
; GFX10-NEXT:    s_bfe_u32 s0, s4, 0x80008
; GFX10-NEXT:    s_lshl_b32 s3, s8, 8
; GFX10-NEXT:    s_and_b32 s5, s8, 0xff00
; GFX10-NEXT:    s_bfe_u32 s8, s4, 0x80010
; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
; GFX10-NEXT:    s_or_b32 s1, s2, s1
; GFX10-NEXT:    s_lshl_b32 s2, s9, 8
; GFX10-NEXT:    s_or_b32 s0, s0, s3
; GFX10-NEXT:    s_or_b32 s3, s8, s5
; GFX10-NEXT:    s_or_b32 s2, s4, s2
; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
; GFX10-NEXT:    s_or_b32 s0, s0, s1
; GFX10-NEXT:    s_or_b32 s1, s2, s3
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT:    s_endpgm
;
; GFX9-LABEL: shuffle8i8:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x80008
; GFX9-NEXT:    s_lshl_b32 s1, s9, 8
; GFX9-NEXT:    s_bfe_u32 s2, s5, 0x80008
; GFX9-NEXT:    s_lshl_b32 s3, s8, 8
; GFX9-NEXT:    s_or_b32 s1, s2, s1
; GFX9-NEXT:    s_or_b32 s0, s0, s3
; GFX9-NEXT:    s_bfe_u32 s2, s4, 0x80010
; GFX9-NEXT:    s_and_b32 s3, s4, 0xff
; GFX9-NEXT:    s_bfe_u32 s4, s9, 0x100010
; GFX9-NEXT:    s_and_b32 s5, s8, 0xff00
; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
; GFX9-NEXT:    s_or_b32 s2, s2, s5
; GFX9-NEXT:    s_or_b32 s3, s3, s4
; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-NEXT:    s_lshl_b32 s2, s2, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
; GFX9-NEXT:    s_or_b32 s2, s3, s2
; GFX9-NEXT:    s_or_b32 s0, s0, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT:    s_endpgm
bb:
  %vec0 = load <8 x i8>, ptr addrspace(1) %in0
  %vec1 = load <8 x i8>, ptr addrspace(1) %in1
  %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9>
  store <8 x i8> %shuffle0, ptr addrspace(1) %out1
  ret void
}
683
; Work-item ID intrinsics. No function above this point calls them; presumably
; they are used by the divergent tests that follow (TODO confirm).
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
686
; Not combined to perm due to non-vectorized use, non-divergent
; The shuffle (mask <1,3,5,4>, bytes from both %vec0 and %vec1) feeds a vector
; add with %vec1. The expected output performs four separate byte adds and
; reassembles the dword with v_or_b32_sdwa; no v_perm_b32 appears.
define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
; GFX10-LABEL: add:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
; GFX10-NEXT:    v_add_nc_u16 v2, v7, v2
; GFX10-NEXT:    v_add_nc_u16 v3, v3, v7
; GFX10-NEXT:    v_add_nc_u16 v1, v1, v4
; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[5:6], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX9-NEXT:    v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1
; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[5:6], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
  %vecins = add <4 x i8> %shuffle0_0, %vec1
  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ret void
}
736
737; Not combined to perm due to non-vectorized use
; Variant of the add test in which both load addresses are indexed by
; llvm.amdgcn.workitem.id.x. The second shuffle operand is undef, and the
; shuffle's only user is the add whose result is stored to %out0. %elt is unused.
; The expected code for both targets contains no v_perm_b32; only the two low
; bytes are computed and the result is masked with 0xffff before the store.
738define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
739; GFX10-LABEL: add_div:
740; GFX10:       ; %bb.0:
741; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
743; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
744; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
745; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
746; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
747; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
748; GFX10-NEXT:    global_load_dword v4, v[0:1], off
749; GFX10-NEXT:    global_load_dword v7, v[2:3], off
750; GFX10-NEXT:    s_waitcnt vmcnt(1)
751; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
752; GFX10-NEXT:    s_waitcnt vmcnt(0)
753; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v7
754; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
755; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v4
756; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
757; GFX10-NEXT:    v_add_nc_u16 v1, v1, v7
758; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
759; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
760; GFX10-NEXT:    global_store_dword v[5:6], v0, off
761; GFX10-NEXT:    s_setpc_b64 s[30:31]
762;
763; GFX9-LABEL: add_div:
764; GFX9:       ; %bb.0:
765; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
767; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
768; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
769; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
770; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
771; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
772; GFX9-NEXT:    global_load_dword v4, v[0:1], off
773; GFX9-NEXT:    global_load_dword v7, v[2:3], off
774; GFX9-NEXT:    s_waitcnt vmcnt(0)
775; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
776; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
777; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
778; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
779; GFX9-NEXT:    global_store_dword v[5:6], v0, off
780; GFX9-NEXT:    s_waitcnt vmcnt(0)
781; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
782  %tid = call i32 @llvm.amdgcn.workitem.id.x()
783  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
784  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
785  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
786  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec0[1], vec0[3], then two lanes taken from the undef operand
  ; (mask indices 5 and 4).
787  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
788  %vecins = add <4 x i8> %shuffle0_0, %vec1
789  store <4 x i8> %vecins, ptr addrspace(1) %out0
790  ret void
791}
792
793; Not combined to perm due to non-divergent use
; Loads from %in0/%in1 directly (no workitem-id indexing), shuffles %vec0 with
; an undef second operand, and uses the shuffle twice: as an add operand (sum
; stored to %out0) and as a stored value (to %out1). %elt is unused.
; The expected code for both targets contains no v_perm_b32; both stored values
; are assembled with and/or instructions and masked with 0xffff.
794define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
795; GFX10-LABEL: add_store:
796; GFX10:       ; %bb.0:
797; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
798; GFX10-NEXT:    global_load_dword v4, v[0:1], off
799; GFX10-NEXT:    global_load_dword v9, v[2:3], off
800; GFX10-NEXT:    s_waitcnt vmcnt(1)
801; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
802; GFX10-NEXT:    s_waitcnt vmcnt(0)
803; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9
804; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4
805; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
806; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffff00
807; GFX10-NEXT:    v_add_nc_u16 v3, v2, v9
808; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
809; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
810; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
811; GFX10-NEXT:    v_or_b32_e32 v1, v2, v1
812; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
813; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
814; GFX10-NEXT:    global_store_dword v[5:6], v0, off
815; GFX10-NEXT:    global_store_dword v[7:8], v1, off
816; GFX10-NEXT:    s_setpc_b64 s[30:31]
817;
818; GFX9-LABEL: add_store:
819; GFX9:       ; %bb.0:
820; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
821; GFX9-NEXT:    global_load_dword v4, v[0:1], off
822; GFX9-NEXT:    global_load_dword v9, v[2:3], off
823; GFX9-NEXT:    s_movk_i32 s4, 0xff00
824; GFX9-NEXT:    s_waitcnt vmcnt(1)
825; GFX9-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
826; GFX9-NEXT:    s_waitcnt vmcnt(0)
827; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
828; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
829; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
830; GFX9-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
831; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
832; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
833; GFX9-NEXT:    global_store_dword v[5:6], v1, off
834; GFX9-NEXT:    global_store_dword v[7:8], v0, off
835; GFX9-NEXT:    s_waitcnt vmcnt(0)
836; GFX9-NEXT:    s_setpc_b64 s[30:31]
837  %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
838  %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; Result lanes: vec0[1], vec0[3], then two lanes taken from the undef operand
  ; (mask indices 5 and 4).
839  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
840  %vecins = add <4 x i8> %shuffle0_0, %vec1
841  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Second use of the shuffle: stored as a whole vector.
842  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
843  ret void
844}
845
846; Not combined to perm due to 16 bit or
; Same IR as add_store, except that both load addresses are indexed by
; llvm.amdgcn.workitem.id.x. The shuffle (second operand undef) feeds both an
; add (stored to %out0) and a direct vector store (to %out1). %elt is unused.
; The expected code for both targets contains no v_perm_b32, and both stored
; values are masked with 0xffff.
; NOTE(review): presumably the "16 bit or" arises because lanes 2 and 3 come
; from the undef operand, leaving only two defined bytes to assemble - confirm
; against the perm combine in the AMDGPU backend.
847define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
848; GFX10-LABEL: add_store_div_16:
849; GFX10:       ; %bb.0:
850; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
852; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
853; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
854; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
855; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
856; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
857; GFX10-NEXT:    global_load_dword v4, v[0:1], off
858; GFX10-NEXT:    global_load_dword v9, v[2:3], off
859; GFX10-NEXT:    s_waitcnt vmcnt(1)
860; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
861; GFX10-NEXT:    s_waitcnt vmcnt(0)
862; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9
863; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v4
864; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
865; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffff00
866; GFX10-NEXT:    v_add_nc_u16 v3, v2, v9
867; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
868; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
869; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
870; GFX10-NEXT:    v_or_b32_e32 v1, v2, v1
871; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
872; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
873; GFX10-NEXT:    global_store_dword v[5:6], v0, off
874; GFX10-NEXT:    global_store_dword v[7:8], v1, off
875; GFX10-NEXT:    s_setpc_b64 s[30:31]
876;
877; GFX9-LABEL: add_store_div_16:
878; GFX9:       ; %bb.0:
879; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
880; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
881; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
882; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
883; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
884; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
885; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
886; GFX9-NEXT:    global_load_dword v4, v[0:1], off
887; GFX9-NEXT:    global_load_dword v9, v[2:3], off
888; GFX9-NEXT:    s_movk_i32 s4, 0xff00
889; GFX9-NEXT:    s_waitcnt vmcnt(1)
890; GFX9-NEXT:    v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
891; GFX9-NEXT:    s_waitcnt vmcnt(0)
892; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
893; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
894; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
895; GFX9-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
896; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
897; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
898; GFX9-NEXT:    global_store_dword v[5:6], v1, off
899; GFX9-NEXT:    global_store_dword v[7:8], v0, off
900; GFX9-NEXT:    s_waitcnt vmcnt(0)
901; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
902  %tid = call i32 @llvm.amdgcn.workitem.id.x()
903  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
904  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
905  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
906  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec0[1], vec0[3], then two lanes taken from the undef operand
  ; (mask indices 5 and 4).
907  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
908  %vecins = add <4 x i8> %shuffle0_0, %vec1
909  store <4 x i8> %vecins, ptr addrspace(1) %out0
910  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
911  ret void
912}
913
914; Vectorized use, divergent, 32 bit or
; Both load addresses are indexed by llvm.amdgcn.workitem.id.x and the shuffle
; takes lanes from both loaded vectors. The shuffle feeds an add (sum stored to
; %out0) and is also stored as a whole vector to %out1. %elt is unused.
; The expected code for both targets produces the stored shuffle with a single
; v_perm_b32 (selector 0x10705); the add is still performed per byte and
; repacked with ors.
915define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
916; GFX10-LABEL: add_store_div:
917; GFX10:       ; %bb.0:
918; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
920; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
921; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
922; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
923; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
924; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
925; GFX10-NEXT:    global_load_dword v4, v[0:1], off
926; GFX10-NEXT:    global_load_dword v9, v[2:3], off
927; GFX10-NEXT:    s_waitcnt vmcnt(1)
928; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
929; GFX10-NEXT:    s_waitcnt vmcnt(0)
930; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v9
931; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v9
932; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v4
933; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
934; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
935; GFX10-NEXT:    v_add_nc_u16 v2, v9, v2
936; GFX10-NEXT:    v_add_nc_u16 v3, v3, v9
937; GFX10-NEXT:    v_add_nc_u16 v1, v1, v10
938; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
939; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
940; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
941; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
942; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
943; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x10705
944; GFX10-NEXT:    global_store_dword v[5:6], v0, off
945; GFX10-NEXT:    global_store_dword v[7:8], v1, off
946; GFX10-NEXT:    s_setpc_b64 s[30:31]
947;
948; GFX9-LABEL: add_store_div:
949; GFX9:       ; %bb.0:
950; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
952; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
953; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
954; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
955; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
956; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
957; GFX9-NEXT:    global_load_dword v4, v[0:1], off
958; GFX9-NEXT:    global_load_dword v9, v[2:3], off
959; GFX9-NEXT:    s_mov_b32 s4, 0x10705
960; GFX9-NEXT:    s_waitcnt vmcnt(0)
961; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4
962; GFX9-NEXT:    v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
963; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
964; GFX9-NEXT:    v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
965; GFX9-NEXT:    v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1
966; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
967; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
968; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
969; GFX9-NEXT:    global_store_dword v[5:6], v1, off
970; GFX9-NEXT:    global_store_dword v[7:8], v0, off
971; GFX9-NEXT:    s_waitcnt vmcnt(0)
972; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
973  %tid = call i32 @llvm.amdgcn.workitem.id.x()
974  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
975  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
976  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
977  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec0[1], vec0[3], vec1[1], vec1[0].
978  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4>
979  %vecins = add <4 x i8> %shuffle0_0, %vec1
980  store <4 x i8> %vecins, ptr addrspace(1) %out0
  ; Whole-vector store of the shuffle; this is the value the expected
  ; v_perm_b32 result is stored for.
981  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
982  ret void
983}
984
; Shuffle with an additional bitwise-and user. Both load addresses are indexed
; by llvm.amdgcn.workitem.id.x. The shuffle is and-ed with the constant vector
; <1, 2, 2, 1> (result stored to %out0) and also stored unmodified to %out1.
; %elt is unused.
; The expected code for both targets emits one v_perm_b32 (selector 0x5070006)
; for the stored shuffle, while the and result is built per byte with
; v_and/v_or instructions.
985define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
986; GFX10-LABEL: and_store_div:
987; GFX10:       ; %bb.0:
988; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
990; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
991; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
992; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
993; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
994; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
995; GFX10-NEXT:    global_load_dword v4, v[2:3], off
996; GFX10-NEXT:    global_load_dword v9, v[0:1], off
997; GFX10-NEXT:    v_mov_b32_e32 v0, 2
998; GFX10-NEXT:    v_mov_b32_e32 v1, 1
999; GFX10-NEXT:    s_waitcnt vmcnt(1)
1000; GFX10-NEXT:    v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1001; GFX10-NEXT:    s_waitcnt vmcnt(0)
1002; GFX10-NEXT:    v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1003; GFX10-NEXT:    v_and_b32_e32 v3, 0x100, v9
1004; GFX10-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1005; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
1006; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1007; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1008; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5070006
1009; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1010; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1011; GFX10-NEXT:    s_setpc_b64 s[30:31]
1012;
1013; GFX9-LABEL: and_store_div:
1014; GFX9:       ; %bb.0:
1015; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1017; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1018; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1019; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1020; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1021; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1022; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1023; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1024; GFX9-NEXT:    s_mov_b32 s4, 0x5070006
1025; GFX9-NEXT:    v_mov_b32_e32 v0, 2
1026; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1027; GFX9-NEXT:    s_waitcnt vmcnt(1)
1028; GFX9-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1029; GFX9-NEXT:    s_waitcnt vmcnt(0)
1030; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
1031; GFX9-NEXT:    v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1032; GFX9-NEXT:    v_and_b32_e32 v9, 0x100, v4
1033; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1034; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1035; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1036; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1037; GFX9-NEXT:    global_store_dword v[5:6], v0, off
1038; GFX9-NEXT:    global_store_dword v[7:8], v2, off
1039; GFX9-NEXT:    s_waitcnt vmcnt(0)
1040; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
1041  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1042  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1043  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1044  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1045  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec0[2], vec1[0], vec0[3], vec0[1].
1046  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1>
1047  %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1048  store <4 x i8> %vecins, ptr addrspace(1) %out0
1049  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1050  ret void
1051}
1052
; Shuffle with an additional arithmetic-shift-right user. Both load addresses
; are indexed by llvm.amdgcn.workitem.id.x. The shuffle is shifted per lane by
; <1, 2, 2, 1> (result stored to %out0) and also stored unmodified to %out1.
; %elt is unused.
; The expected code for both targets emits one v_perm_b32 (selector 0x4010707)
; for the stored shuffle, while the shifted vector is built per byte with
; shift, and, and or instructions.
1053define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1054; GFX10-LABEL: ashr_store_div:
1055; GFX10:       ; %bb.0:
1056; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1058; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1059; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1060; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1061; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1062; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v4
1063; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1064; GFX10-NEXT:    v_mov_b32_e32 v2, 26
1065; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1066; GFX10-NEXT:    s_waitcnt vmcnt(1)
1067; GFX10-NEXT:    v_bfe_i32 v1, v9, 0, 8
1068; GFX10-NEXT:    v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1069; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 25, v9
1070; GFX10-NEXT:    v_lshlrev_b16 v1, 7, v1
1071; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1072; GFX10-NEXT:    s_waitcnt vmcnt(0)
1073; GFX10-NEXT:    v_ashrrev_i16 v4, 10, v0
1074; GFX10-NEXT:    v_perm_b32 v0, v9, v0, 0x4010707
1075; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1
1076; GFX10-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1077; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1078; GFX10-NEXT:    global_store_dword v[5:6], v1, off
1079; GFX10-NEXT:    global_store_dword v[7:8], v0, off
1080; GFX10-NEXT:    s_setpc_b64 s[30:31]
1081;
1082; GFX9-LABEL: ashr_store_div:
1083; GFX9:       ; %bb.0:
1084; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1085; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1086; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1087; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1088; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1089; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1090; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1091; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1092; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1093; GFX9-NEXT:    v_mov_b32_e32 v1, 7
1094; GFX9-NEXT:    s_mov_b32 s4, 0x4010707
1095; GFX9-NEXT:    v_mov_b32_e32 v0, 26
1096; GFX9-NEXT:    s_waitcnt vmcnt(1)
1097; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1098; GFX9-NEXT:    s_waitcnt vmcnt(0)
1099; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
1100; GFX9-NEXT:    v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1101; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 25, v4
1102; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 10, v9
1103; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1
1104; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1105; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1106; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1107; GFX9-NEXT:    global_store_dword v[5:6], v0, off
1108; GFX9-NEXT:    global_store_dword v[7:8], v2, off
1109; GFX9-NEXT:    s_waitcnt vmcnt(0)
1110; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
1111  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1112  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1113  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1114  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1115  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec0[3], vec0[3], vec1[1], vec0[0] (vec0[3] is used twice).
1116  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0>
1117  %vecins = ashr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1118  store <4 x i8> %vecins, ptr addrspace(1) %out0
1119  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1120  ret void
1121}
1122
; Shuffle with an additional bitcast user. Both load addresses are indexed by
; llvm.amdgcn.workitem.id.x. The shuffle is bitcast to i32 and stored to %out1,
; then stored as a <4 x i8> vector to %out0. %elt is unused.
; The expected code for both targets emits a single v_perm_b32 (selector
; 0x7060104) whose result register is the source of both stores.
1123define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1124; GFX10-LABEL: bc_store_div:
1125; GFX10:       ; %bb.0:
1126; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1127; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1128; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1129; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1130; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1131; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1132; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1133; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1134; GFX10-NEXT:    global_load_dword v9, v[2:3], off
1135; GFX10-NEXT:    s_waitcnt vmcnt(0)
1136; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x7060104
1137; GFX10-NEXT:    global_store_dword v[7:8], v0, off
1138; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1139; GFX10-NEXT:    s_setpc_b64 s[30:31]
1140;
1141; GFX9-LABEL: bc_store_div:
1142; GFX9:       ; %bb.0:
1143; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1145; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1146; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1147; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1148; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1149; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1150; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1151; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1152; GFX9-NEXT:    s_mov_b32 s4, 0x7060104
1153; GFX9-NEXT:    s_waitcnt vmcnt(0)
1154; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
1155; GFX9-NEXT:    global_store_dword v[7:8], v0, off
1156; GFX9-NEXT:    global_store_dword v[5:6], v0, off
1157; GFX9-NEXT:    s_waitcnt vmcnt(0)
1158; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
1159  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1160  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1161  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1162  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1163  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec1[0], vec0[1], vec1[2], vec1[3].
1164  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  ; The same 32 bits are stored twice: once as i32 and once as <4 x i8>.
1165  %insvec = bitcast <4 x i8> %shuffle0_0 to i32
1166  store i32 %insvec, ptr addrspace(1) %out1
1167  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
1168  ret void
1169}
1170
1171
; Shuffle with an additional extractelement user. Both load addresses are
; indexed by llvm.amdgcn.workitem.id.x. Lane 1 of the shuffle is extracted and
; stored as a single byte to %out2, and the whole shuffle is stored to %out1.
; %elt and %out0 are unused.
; The expected code for both targets emits one v_perm_b32 (selector 0x1020305)
; for the vector store and obtains the extracted byte with a 24-bit right shift
; of the first loaded dword.
1172define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
1173; GFX10-LABEL: eve_store_div:
1174; GFX10:       ; %bb.0:
1175; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1177; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1178; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1179; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1180; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1181; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1182; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1183; GFX10-NEXT:    global_load_dword v5, v[2:3], off
1184; GFX10-NEXT:    s_waitcnt vmcnt(1)
1185; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v4
1186; GFX10-NEXT:    s_waitcnt vmcnt(0)
1187; GFX10-NEXT:    v_perm_b32 v1, v5, v4, 0x1020305
1188; GFX10-NEXT:    global_store_byte v[9:10], v0, off
1189; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1190; GFX10-NEXT:    s_setpc_b64 s[30:31]
1191;
1192; GFX9-LABEL: eve_store_div:
1193; GFX9:       ; %bb.0:
1194; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1195; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1196; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1197; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1198; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1199; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1200; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1201; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1202; GFX9-NEXT:    global_load_dword v5, v[2:3], off
1203; GFX9-NEXT:    s_mov_b32 s4, 0x1020305
1204; GFX9-NEXT:    s_waitcnt vmcnt(1)
1205; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
1206; GFX9-NEXT:    s_waitcnt vmcnt(0)
1207; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
1208; GFX9-NEXT:    global_store_byte v[9:10], v1, off
1209; GFX9-NEXT:    global_store_dword v[7:8], v0, off
1210; GFX9-NEXT:    s_waitcnt vmcnt(0)
1211; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
1212  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1213  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1214  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1215  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1216  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec1[1], vec0[3], vec0[2], vec0[1].
1217  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1>
  ; Lane 1 of the shuffle is vec0[3].
1218  %tmp = extractelement <4 x i8> %shuffle0_0, i32 1
1219  store i8 %tmp, ptr addrspace(1) %out2
1220  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1221  ret void
1222}
1223
1224; Not combined to perm due to multi use of or operands (introduced by insert op)
; Shuffle with an additional insertelement user. Both load addresses are
; indexed by llvm.amdgcn.workitem.id.x. %elt replaces lane 1 of the shuffle and
; that vector is stored to %out0; the unmodified shuffle is stored to %out1.
; In the expected code for both targets the value stored for %out0 is built
; with shift/and/or instructions, while the unmodified shuffle is produced by a
; v_perm_b32 (selector 0x2000706).
1225define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1226; GFX10-LABEL: ive_store_div:
1227; GFX10:       ; %bb.0:
1228; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1229; GFX10-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
1230; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 2, v9
1231; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v9
1232; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1233; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v9
1234; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1235; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1236; GFX10-NEXT:    global_load_dword v10, v[2:3], off
1237; GFX10-NEXT:    v_mov_b32_e32 v0, 16
1238; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
1239; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v4
1240; GFX10-NEXT:    s_waitcnt vmcnt(1)
1241; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1242; GFX10-NEXT:    s_waitcnt vmcnt(0)
1243; GFX10-NEXT:    v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1244; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1245; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
1246; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1247; GFX10-NEXT:    v_perm_b32 v1, v10, v9, 0x2000706
1248; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1249; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1250; GFX10-NEXT:    s_setpc_b64 s[30:31]
1251;
1252; GFX9-LABEL: ive_store_div:
1253; GFX9:       ; %bb.0:
1254; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255; GFX9-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
1256; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 2, v9
1257; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v9
1258; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1259; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v9
1260; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1261; GFX9-NEXT:    global_load_dword v9, v[0:1], off
1262; GFX9-NEXT:    global_load_dword v10, v[2:3], off
1263; GFX9-NEXT:    s_movk_i32 s4, 0xff
1264; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
1265; GFX9-NEXT:    s_mov_b32 s5, 0x2000706
1266; GFX9-NEXT:    s_waitcnt vmcnt(1)
1267; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
1268; GFX9-NEXT:    s_waitcnt vmcnt(0)
1269; GFX9-NEXT:    v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1270; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1271; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
1272; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1273; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1274; GFX9-NEXT:    v_perm_b32 v3, v10, v9, s5
1275; GFX9-NEXT:    global_store_dword v[5:6], v0, off
1276; GFX9-NEXT:    global_store_dword v[7:8], v3, off
1277; GFX9-NEXT:    s_waitcnt vmcnt(0)
1278; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Per-workitem element index makes both load addresses lane-dependent.
1279  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1280  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1281  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1282  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1283  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Result lanes: vec1[2], vec1[3], vec0[0], vec0[2].
1284  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2>
  ; Overwrites lane 1 (vec1[3]) with the %elt argument.
1285  %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1
1286  store <4 x i8> %vecins, ptr addrspace(1) %out0
1287  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1288  ret void
1289}
1290
1291
; lhsr_store_div: a two-source <4 x i8> shuffle (mask <7, 7, 3, 1>) whose result
; has two users: a per-lane lshr by <1, 2, 2, 1> stored to %out0, and a direct
; store of the unmodified shuffle to %out1. Both loads are addressed by the
; workitem id. The assertions below expect the direct store to be fed by a
; single v_perm_b32 (selector 0x1030707) on both targets, while the shifted
; copy is built separately from shift/and/or instructions.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1292define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1293; GFX10-LABEL: lhsr_store_div:
1294; GFX10:       ; %bb.0:
1295; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1296; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1297; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1298; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1299; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1300; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1301; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1302; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1303; GFX10-NEXT:    global_load_dword v9, v[2:3], off
1304; GFX10-NEXT:    v_mov_b32_e32 v0, 26
1305; GFX10-NEXT:    s_waitcnt vmcnt(1)
1306; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v4
1307; GFX10-NEXT:    s_waitcnt vmcnt(0)
1308; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1309; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 25, v9
1310; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 26, v4
1311; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f00, v1
1312; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
1313; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1314; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1315; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x1030707
1316; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1317; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1318; GFX10-NEXT:    s_setpc_b64 s[30:31]
1319;
1320; GFX9-LABEL: lhsr_store_div:
1321; GFX9:       ; %bb.0:
1322; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1323; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1324; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1325; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1326; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1327; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1328; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1329; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1330; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1331; GFX9-NEXT:    v_mov_b32_e32 v0, 26
1332; GFX9-NEXT:    s_mov_b32 s4, 0x1030707
1333; GFX9-NEXT:    s_waitcnt vmcnt(1)
1334; GFX9-NEXT:    v_lshrrev_b16_e32 v3, 1, v4
1335; GFX9-NEXT:    s_waitcnt vmcnt(0)
1336; GFX9-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1337; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 25, v9
1338; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
1339; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 26, v4
1340; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
1341; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f00, v3
1342; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1343; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1344; GFX9-NEXT:    global_store_dword v[5:6], v0, off
1345; GFX9-NEXT:    global_store_dword v[7:8], v1, off
1346; GFX9-NEXT:    s_waitcnt vmcnt(0)
1347; GFX9-NEXT:    s_setpc_b64 s[30:31]
1348  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1349  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1350  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1351  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1352  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1353  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1>
  ; Second user of the shuffle: a non-uniform logical shift right per lane.
1354  %vecins = lshr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1355  store <4 x i8> %vecins, ptr addrspace(1) %out0
1356  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1357  ret void
1358}
1359
1360
; mul_store_div: a two-source <4 x i8> shuffle (mask <0, 1, 4, 6>) whose result
; is both multiplied lane-wise by %vec1 (stored to %out0) and stored unmodified
; to %out1. Both loads are addressed by the workitem id. The assertions expect
; the direct store to come from one v_perm_b32 (selector 0x2000504) on both
; targets; the multiply is expanded into 16-bit multiplies whose bytes are
; recombined with v_or_b32_sdwa.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1361define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1362; GFX10-LABEL: mul_store_div:
1363; GFX10:       ; %bb.0:
1364; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1366; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1367; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1368; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1369; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1370; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1371; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1372; GFX10-NEXT:    global_load_dword v9, v[2:3], off
1373; GFX10-NEXT:    s_waitcnt vmcnt(1)
1374; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v4
1375; GFX10-NEXT:    s_waitcnt vmcnt(0)
1376; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 24, v9
1377; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v9
1378; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
1379; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v2
1380; GFX10-NEXT:    v_mul_lo_u16 v1, v3, v1
1381; GFX10-NEXT:    v_mul_lo_u16 v2, v4, v9
1382; GFX10-NEXT:    v_mul_lo_u16 v3, v9, v3
1383; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
1384; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
1385; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1386; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1387; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1388; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2000504
1389; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1390; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1391; GFX10-NEXT:    s_setpc_b64 s[30:31]
1392;
1393; GFX9-LABEL: mul_store_div:
1394; GFX9:       ; %bb.0:
1395; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1396; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1397; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1398; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1399; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1400; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1401; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1402; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1403; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1404; GFX9-NEXT:    s_mov_b32 s4, 0x2000504
1405; GFX9-NEXT:    s_waitcnt vmcnt(0)
1406; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4
1407; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v4, v9
1408; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1409; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3
1410; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1411; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1412; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1413; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1414; GFX9-NEXT:    global_store_dword v[5:6], v1, off
1415; GFX9-NEXT:    global_store_dword v[7:8], v0, off
1416; GFX9-NEXT:    s_waitcnt vmcnt(0)
1417; GFX9-NEXT:    s_setpc_b64 s[30:31]
1418  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1419  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1420  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1421  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1422  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1423  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6>
  ; Second user of the shuffle: lane-wise multiply by the second loaded vector.
1424  %vecins = mul <4 x i8> %shuffle0_0, %vec1
1425  store <4 x i8> %vecins, ptr addrspace(1) %out0
1426  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1427  ret void
1428}
1429
1430
; or_store_div: a two-source <4 x i8> shuffle (mask <1, 4, 5, 6>) whose result
; is both OR-ed lane-wise with the constant <1, 2, 2, 1> (stored to %out0) and
; stored unmodified to %out1. Both loads are addressed by the workitem id. The
; assertions expect the direct store to come from one v_perm_b32 (selector
; 0x2010005) on both targets; the OR-ed copy is assembled from shifts and
; v_or_b32 / v_or_b32_sdwa instructions.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1431define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1432; GFX10-LABEL: or_store_div:
1433; GFX10:       ; %bb.0:
1434; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1435; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1436; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1437; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1438; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1439; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1440; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1441; GFX10-NEXT:    global_load_dword v4, v[2:3], off
1442; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1443; GFX10-NEXT:    v_mov_b32_e32 v0, 16
1444; GFX10-NEXT:    v_bfrev_b32_e32 v2, 4.0
1445; GFX10-NEXT:    s_waitcnt vmcnt(1)
1446; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4
1447; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1448; GFX10-NEXT:    s_waitcnt vmcnt(0)
1449; GFX10-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1450; GFX10-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1451; GFX10-NEXT:    v_or_b32_e32 v1, 0x201, v1
1452; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1453; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1454; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x2010005
1455; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1456; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1457; GFX10-NEXT:    s_setpc_b64 s[30:31]
1458;
1459; GFX9-LABEL: or_store_div:
1460; GFX9:       ; %bb.0:
1461; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1462; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1463; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1464; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1465; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1466; GFX9-NEXT:    global_load_dword v2, v[2:3], off
1467; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1468; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1469; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1470; GFX9-NEXT:    s_mov_b32 s4, 0x2010005
1471; GFX9-NEXT:    s_movk_i32 s5, 0x102
1472; GFX9-NEXT:    s_waitcnt vmcnt(1)
1473; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1474; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
1475; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
1476; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1477; GFX9-NEXT:    v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1478; GFX9-NEXT:    s_waitcnt vmcnt(0)
1479; GFX9-NEXT:    v_perm_b32 v4, v0, v2, s4
1480; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1481; GFX9-NEXT:    v_or_b32_e32 v0, 0x201, v0
1482; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1483; GFX9-NEXT:    global_store_dword v[5:6], v0, off
1484; GFX9-NEXT:    global_store_dword v[7:8], v4, off
1485; GFX9-NEXT:    s_waitcnt vmcnt(0)
1486; GFX9-NEXT:    s_setpc_b64 s[30:31]
1487  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1488  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1489  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1490  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1491  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1492  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
  ; Second user of the shuffle: lane-wise OR with a constant vector.
1493  %vecins = or <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1494  store <4 x i8> %vecins, ptr addrspace(1) %out0
1495  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1496  ret void
1497}
1498
; sdiv_store_div: a two-source <4 x i8> shuffle (mask <2, 3, 2, 4>) whose result
; is both divided lane-wise (signed) by %vec1 (stored to %out0) and stored
; unmodified to %out1. Both loads are addressed by the workitem id. The
; assertions expect the direct store to come from one v_perm_b32 (selector
; 0x60706) on both targets. The i8 sdiv itself is expected to be expanded per
; lane into a float sequence (v_cvt_f32_i32 / v_rcp_iflag_f32 / v_mul_f32 /
; v_trunc_f32 / v_mad_f32 plus a compare-and-select correction), which is why
; the check block is long.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1499define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1500; GFX10-LABEL: sdiv_store_div:
1501; GFX10:       ; %bb.0:
1502; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1503; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1504; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1505; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1506; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1507; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1508; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1509; GFX10-NEXT:    global_load_dword v4, v[2:3], off
1510; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1511; GFX10-NEXT:    s_waitcnt vmcnt(1)
1512; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1513; GFX10-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1514; GFX10-NEXT:    s_waitcnt vmcnt(0)
1515; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1516; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1517; GFX10-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1518; GFX10-NEXT:    v_rcp_iflag_f32_e32 v15, v1
1519; GFX10-NEXT:    v_rcp_iflag_f32_e32 v16, v10
1520; GFX10-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1521; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v12
1522; GFX10-NEXT:    v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1523; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v14
1524; GFX10-NEXT:    v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1525; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1526; GFX10-NEXT:    v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
1527; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1528; GFX10-NEXT:    v_mul_f32_e32 v15, v2, v15
1529; GFX10-NEXT:    v_mul_f32_e32 v16, v19, v16
1530; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
1531; GFX10-NEXT:    v_mul_f32_e32 v17, v2, v17
1532; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0
1533; GFX10-NEXT:    v_trunc_f32_e32 v15, v15
1534; GFX10-NEXT:    v_trunc_f32_e32 v16, v16
1535; GFX10-NEXT:    v_mul_f32_e32 v18, v1, v18
1536; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
1537; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
1538; GFX10-NEXT:    v_mad_f32 v20, -v15, v1, v2
1539; GFX10-NEXT:    v_mad_f32 v19, -v16, v10, v19
1540; GFX10-NEXT:    v_or_b32_e32 v3, 1, v3
1541; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
1542; GFX10-NEXT:    v_mad_f32 v2, -v17, v12, v2
1543; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
1544; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 30, v13
1545; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11
1546; GFX10-NEXT:    v_mad_f32 v21, -v18, v14, v1
1547; GFX10-NEXT:    v_cvt_i32_f32_e32 v15, v15
1548; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
1549; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
1550; GFX10-NEXT:    v_or_b32_e32 v13, 1, v13
1551; GFX10-NEXT:    v_cvt_i32_f32_e32 v16, v16
1552; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
1553; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
1554; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc_lo
1555; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12|
1556; GFX10-NEXT:    v_add_nc_u32_e32 v0, v15, v0
1557; GFX10-NEXT:    v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1558; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo
1559; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
1560; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1561; GFX10-NEXT:    v_add_nc_u32_e32 v2, v17, v2
1562; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v13, vcc_lo
1563; GFX10-NEXT:    v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1564; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1565; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1566; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x60706
1567; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1568; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1569; GFX10-NEXT:    s_setpc_b64 s[30:31]
1570;
1571; GFX9-LABEL: sdiv_store_div:
1572; GFX9:       ; %bb.0:
1573; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1575; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1576; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1577; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1578; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1579; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1580; GFX9-NEXT:    global_load_dword v4, v[2:3], off
1581; GFX9-NEXT:    global_load_dword v9, v[0:1], off
1582; GFX9-NEXT:    s_mov_b32 s4, 0x60706
1583; GFX9-NEXT:    s_waitcnt vmcnt(1)
1584; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1585; GFX9-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1586; GFX9-NEXT:    s_waitcnt vmcnt(0)
1587; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
1588; GFX9-NEXT:    v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1589; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1590; GFX9-NEXT:    v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1591; GFX9-NEXT:    v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1592; GFX9-NEXT:    v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1593; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1594; GFX9-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
1595; GFX9-NEXT:    v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1596; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2
1597; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v12
1598; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v13
1599; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v4
1600; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15
1601; GFX9-NEXT:    v_mul_f32_e32 v16, v11, v16
1602; GFX9-NEXT:    v_trunc_f32_e32 v15, v15
1603; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
1604; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17
1605; GFX9-NEXT:    v_mul_f32_e32 v18, v2, v18
1606; GFX9-NEXT:    v_trunc_f32_e32 v16, v16
1607; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3
1608; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 30, v10
1609; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
1610; GFX9-NEXT:    v_trunc_f32_e32 v17, v17
1611; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
1612; GFX9-NEXT:    v_mad_f32 v11, -v16, v12, v11
1613; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v2|
1614; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 30, v9
1615; GFX9-NEXT:    v_or_b32_e32 v10, 1, v10
1616; GFX9-NEXT:    v_cvt_i32_f32_e32 v15, v15
1617; GFX9-NEXT:    v_cvt_i32_f32_e32 v16, v16
1618; GFX9-NEXT:    v_mad_f32 v3, -v17, v13, v3
1619; GFX9-NEXT:    v_cvt_i32_f32_e32 v17, v17
1620; GFX9-NEXT:    v_mad_f32 v2, -v18, v4, v2
1621; GFX9-NEXT:    v_cvt_i32_f32_e32 v18, v18
1622; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
1623; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v11|, |v12|
1624; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
1625; GFX9-NEXT:    v_or_b32_e32 v9, 1, v9
1626; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
1627; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v13|
1628; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14
1629; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
1630; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v4|
1631; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v14, vcc
1632; GFX9-NEXT:    v_add_u32_e32 v1, v15, v1
1633; GFX9-NEXT:    v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1634; GFX9-NEXT:    v_add_u32_e32 v3, v17, v3
1635; GFX9-NEXT:    v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1636; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1637; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1638; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1639; GFX9-NEXT:    global_store_dword v[5:6], v1, off
1640; GFX9-NEXT:    global_store_dword v[7:8], v0, off
1641; GFX9-NEXT:    s_waitcnt vmcnt(0)
1642; GFX9-NEXT:    s_setpc_b64 s[30:31]
1643  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1644  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1645  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1646  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1647  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1648  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 4>
  ; Second user of the shuffle: lane-wise signed division by %vec1.
1649  %vecins = sdiv <4 x i8> %shuffle0_0, %vec1
1650  store <4 x i8> %vecins, ptr addrspace(1) %out0
1651  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1652  ret void
1653}
1654
1655
; sext_store_div: a two-source <4 x i8> shuffle (mask <3, 3, 5, 7>) whose result
; is both sign-extended to <4 x i16> and stored unmodified. Note the pointer
; roles differ from the arithmetic tests in this file: here the widened
; <4 x i16> value goes to %out1 and the raw shuffle goes to %out0. Both loads
; are addressed by the workitem id. The assertions expect the raw shuffle to
; come from one v_perm_b32 (selector 0x3010707), and the extended lanes to be
; produced with v_ashrrev_i16 and packed with v_perm_b32 selector 0x5040100
; before a single global_store_dwordx2.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1656define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1657; GFX10-LABEL: sext_store_div:
1658; GFX10:       ; %bb.0:
1659; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1660; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1661; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1662; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1663; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1664; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1665; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1666; GFX10-NEXT:    global_load_dword v4, v[2:3], off
1667; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1668; GFX10-NEXT:    s_waitcnt vmcnt(1)
1669; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
1670; GFX10-NEXT:    s_waitcnt vmcnt(0)
1671; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
1672; GFX10-NEXT:    v_ashrrev_i16 v2, 8, v4
1673; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
1674; GFX10-NEXT:    v_ashrrev_i16 v3, 8, v1
1675; GFX10-NEXT:    v_perm_b32 v1, v0, v2, 0x5040100
1676; GFX10-NEXT:    v_perm_b32 v0, v3, v3, 0x5040100
1677; GFX10-NEXT:    v_perm_b32 v2, v9, v4, 0x3010707
1678; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
1679; GFX10-NEXT:    global_store_dword v[5:6], v2, off
1680; GFX10-NEXT:    s_setpc_b64 s[30:31]
1681;
1682; GFX9-LABEL: sext_store_div:
1683; GFX9:       ; %bb.0:
1684; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1685; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1686; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1687; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1688; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1689; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1690; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1691; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1692; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1693; GFX9-NEXT:    v_mov_b32_e32 v0, 8
1694; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
1695; GFX9-NEXT:    s_mov_b32 s4, 0x3010707
1696; GFX9-NEXT:    s_waitcnt vmcnt(0)
1697; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 8, v9
1698; GFX9-NEXT:    v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1699; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1700; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s5
1701; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s5
1702; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
1703; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
1704; GFX9-NEXT:    global_store_dword v[5:6], v2, off
1705; GFX9-NEXT:    s_waitcnt vmcnt(0)
1706; GFX9-NEXT:    s_setpc_b64 s[30:31]
1707  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1708  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1709  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1710  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1711  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1712  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
  ; Second user of the shuffle: widening sign extension (8 bytes, stored to %out1).
1713  %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16>
1714  store <4 x i16> %insvec, ptr addrspace(1) %out1
1715  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
1716  ret void
1717}
1718
1719
; shl_store_div: a two-source <4 x i8> shuffle (mask <4, 1, 0, 5>) whose result
; is both shifted left lane-wise by <1, 2, 2, 1> (stored to %out0) and stored
; unmodified to %out1. Both loads are addressed by the workitem id. The
; assertions expect the direct store to come from one v_perm_b32 (selector
; 0x5000104) on both targets; the shifted copy is built from 16-bit shifts
; whose bytes are masked with v_and_b32 and merged with v_or_b32.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1720define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1721; GFX10-LABEL: shl_store_div:
1722; GFX10:       ; %bb.0:
1723; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1725; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1726; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1727; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1728; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1729; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1730; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1731; GFX10-NEXT:    global_load_dword v9, v[2:3], off
1732; GFX10-NEXT:    s_waitcnt vmcnt(1)
1733; GFX10-NEXT:    v_lshlrev_b16 v0, 2, v4
1734; GFX10-NEXT:    s_waitcnt vmcnt(0)
1735; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v9
1736; GFX10-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v0
1737; GFX10-NEXT:    v_and_b32_e32 v3, 0xfe, v1
1738; GFX10-NEXT:    v_and_b32_e32 v1, 0xfffffe00, v1
1739; GFX10-NEXT:    v_and_b32_e32 v0, 0xfc, v0
1740; GFX10-NEXT:    v_or_b32_e32 v2, v3, v2
1741; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1742; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5000104
1743; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1744; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1745; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1746; GFX10-NEXT:    s_setpc_b64 s[30:31]
1747;
1748; GFX9-LABEL: shl_store_div:
1749; GFX9:       ; %bb.0:
1750; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1752; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1753; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1754; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1755; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1756; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1757; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1758; GFX9-NEXT:    global_load_dword v9, v[2:3], off
1759; GFX9-NEXT:    s_mov_b32 s4, 0x5000104
1760; GFX9-NEXT:    s_waitcnt vmcnt(1)
1761; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 2, v4
1762; GFX9-NEXT:    s_waitcnt vmcnt(0)
1763; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 1, v9
1764; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
1765; GFX9-NEXT:    v_and_b32_e32 v3, 0xfffffc00, v1
1766; GFX9-NEXT:    v_and_b32_e32 v4, 0xfe, v2
1767; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffe00, v2
1768; GFX9-NEXT:    v_and_b32_e32 v1, 0xfc, v1
1769; GFX9-NEXT:    v_or_b32_e32 v3, v4, v3
1770; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1771; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1772; GFX9-NEXT:    global_store_dword v[5:6], v1, off
1773; GFX9-NEXT:    global_store_dword v[7:8], v0, off
1774; GFX9-NEXT:    s_waitcnt vmcnt(0)
1775; GFX9-NEXT:    s_setpc_b64 s[30:31]
1776  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1777  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1778  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1779  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1780  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1781  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5>
  ; Second user of the shuffle: a non-uniform shift left per lane.
1782  %vecins = shl <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
1783  store <4 x i8> %vecins, ptr addrspace(1) %out0
1784  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
1785  ret void
1786}
1787
1788
; sitofp_store_div: a two-source <4 x i8> shuffle (mask <5, 2, 1, 6>) whose
; result is both converted lane-wise to float (signed) and stored unmodified.
; Note the pointer roles differ from the arithmetic tests in this file: here
; the <4 x float> value goes to %out1 and the raw shuffle goes to %out0. Both
; loads are addressed by the workitem id. The assertions expect the raw shuffle
; to come from one v_perm_b32 (selector 0x6010205), and the four floats to be
; produced by v_cvt_f32_i32_sdwa on sign-extended bytes and written with a
; single global_store_dwordx4.
; NOTE(review): the "_div" suffix presumably means "divergent" (tid-based
; addressing) -- confirm. %elt is not referenced by this function.
1789define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1790; GFX10-LABEL: sitofp_store_div:
1791; GFX10:       ; %bb.0:
1792; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1793; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1794; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1795; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1796; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1797; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1798; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1799; GFX10-NEXT:    global_load_dword v4, v[2:3], off
1800; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1801; GFX10-NEXT:    s_waitcnt vmcnt(1)
1802; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
1803; GFX10-NEXT:    s_waitcnt vmcnt(0)
1804; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
1805; GFX10-NEXT:    v_ashrrev_i16 v2, 8, v9
1806; GFX10-NEXT:    v_ashrrev_i16 v3, 8, v4
1807; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x6010205
1808; GFX10-NEXT:    v_bfe_i32 v10, v0, 0, 8
1809; GFX10-NEXT:    v_bfe_i32 v1, v1, 0, 8
1810; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1811; GFX10-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1812; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1813; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1814; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
1815; GFX10-NEXT:    global_store_dword v[5:6], v4, off
1816; GFX10-NEXT:    s_setpc_b64 s[30:31]
1817;
1818; GFX9-LABEL: sitofp_store_div:
1819; GFX9:       ; %bb.0:
1820; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1821; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1822; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1823; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1824; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1825; GFX9-NEXT:    global_load_dword v9, v[0:1], off
1826; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v4
1827; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
1828; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1829; GFX9-NEXT:    s_mov_b32 s4, 0x6010205
1830; GFX9-NEXT:    s_waitcnt vmcnt(1)
1831; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
1832; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 8, v9
1833; GFX9-NEXT:    v_bfe_i32 v10, v0, 0, 8
1834; GFX9-NEXT:    s_waitcnt vmcnt(0)
1835; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
1836; GFX9-NEXT:    v_ashrrev_i16_e32 v3, 8, v4
1837; GFX9-NEXT:    v_bfe_i32 v11, v2, 0, 8
1838; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1839; GFX9-NEXT:    v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1840; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1841; GFX9-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
1842; GFX9-NEXT:    v_perm_b32 v4, v4, v9, s4
1843; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
1844; GFX9-NEXT:    global_store_dword v[5:6], v4, off
1845; GFX9-NEXT:    s_waitcnt vmcnt(0)
1846; GFX9-NEXT:    s_setpc_b64 s[30:31]
1847  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1848  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
1849  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1850  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
1851  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
1852  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 1, i32 6>
  ; Second user of the shuffle: signed int-to-float conversion (16 bytes, stored to %out1).
1853  %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float>
1854  store <4 x float> %insvec, ptr addrspace(1) %out1
1855  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
1856  ret void
1857}
1858
1859
; srem_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and builds the two-source shuffle
; <6, 3, 7, 2> (indices 0-3 pick from %vec0, 4-7 from %vec1). The shuffle is
; used twice: as the dividend of a vector srem by %vec1, whose result is stored
; to %out0, and as a plain store to %out1. The assertions below expect the
; stored shuffle to be a single v_perm_b32 with selector 0x2070306, emitted
; alongside the expanded per-byte signed-remainder sequence. %elt is not
; referenced by the body.
1860define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
1861; GFX10-LABEL: srem_store_div:
1862; GFX10:       ; %bb.0:
1863; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1865; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1866; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
1867; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1868; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
1869; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1870; GFX10-NEXT:    global_load_dword v4, v[2:3], off
1871; GFX10-NEXT:    global_load_dword v9, v[0:1], off
1872; GFX10-NEXT:    s_waitcnt vmcnt(1)
1873; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1874; GFX10-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1875; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1876; GFX10-NEXT:    s_waitcnt vmcnt(0)
1877; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1878; GFX10-NEXT:    v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1879; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v2
1880; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v13
1881; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v3
1882; GFX10-NEXT:    v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1883; GFX10-NEXT:    v_rcp_iflag_f32_e32 v20, v15
1884; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1885; GFX10-NEXT:    v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1886; GFX10-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
1887; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
1888; GFX10-NEXT:    v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
1889; GFX10-NEXT:    v_mul_f32_e32 v17, v3, v17
1890; GFX10-NEXT:    v_mul_f32_e32 v18, v12, v18
1891; GFX10-NEXT:    v_mul_f32_e32 v19, v15, v19
1892; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
1893; GFX10-NEXT:    v_or_b32_e32 v1, 1, v1
1894; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
1895; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
1896; GFX10-NEXT:    v_mul_f32_e32 v20, v21, v20
1897; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
1898; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
1899; GFX10-NEXT:    v_mad_f32 v22, -v17, v2, v3
1900; GFX10-NEXT:    v_mad_f32 v12, -v18, v13, v12
1901; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11
1902; GFX10-NEXT:    v_trunc_f32_e32 v20, v20
1903; GFX10-NEXT:    v_mad_f32 v23, -v19, v3, v15
1904; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2|
1905; GFX10-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
1906; GFX10-NEXT:    v_or_b32_e32 v14, 1, v14
1907; GFX10-NEXT:    v_mad_f32 v21, -v20, v15, v21
1908; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
1909; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
1910; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13|
1911; GFX10-NEXT:    v_or_b32_e32 v16, 1, v16
1912; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
1913; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
1914; GFX10-NEXT:    v_cvt_i32_f32_e32 v20, v20
1915; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo
1916; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3|
1917; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
1918; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
1919; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
1920; GFX10-NEXT:    v_add_nc_u32_e32 v1, v17, v1
1921; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc_lo
1922; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15|
1923; GFX10-NEXT:    v_add_nc_u32_e32 v2, v18, v2
1924; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v4
1925; GFX10-NEXT:    v_add_nc_u32_e32 v3, v19, v3
1926; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
1927; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v10
1928; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
1929; GFX10-NEXT:    v_add_nc_u32_e32 v11, v20, v11
1930; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
1931; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
1932; GFX10-NEXT:    v_mul_lo_u32 v10, v11, v12
1933; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v12, v3
1934; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1935; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1936; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1937; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1938; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2070306
1939; GFX10-NEXT:    global_store_dword v[5:6], v0, off
1940; GFX10-NEXT:    global_store_dword v[7:8], v1, off
1941; GFX10-NEXT:    s_setpc_b64 s[30:31]
1942;
1943; GFX9-LABEL: srem_store_div:
1944; GFX9:       ; %bb.0:
1945; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1946; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
1947; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
1948; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
1949; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1950; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1951; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1952; GFX9-NEXT:    global_load_dword v4, v[2:3], off
1953; GFX9-NEXT:    global_load_dword v9, v[0:1], off
1954; GFX9-NEXT:    s_mov_b32 s4, 0x2070306
1955; GFX9-NEXT:    s_waitcnt vmcnt(1)
1956; GFX9-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
1957; GFX9-NEXT:    s_waitcnt vmcnt(0)
1958; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1959; GFX9-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1960; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
1961; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v14
1962; GFX9-NEXT:    v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
1963; GFX9-NEXT:    v_rcp_iflag_f32_e32 v19, v10
1964; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
1965; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v18
1966; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
1967; GFX9-NEXT:    v_mad_f32 v13, -v18, v14, v13
1968; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, |v14|
1969; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v3
1970; GFX9-NEXT:    v_mul_f32_e32 v14, v16, v19
1971; GFX9-NEXT:    v_trunc_f32_e32 v14, v14
1972; GFX9-NEXT:    v_mad_f32 v19, -v14, v10, v16
1973; GFX9-NEXT:    v_mul_f32_e32 v13, v10, v13
1974; GFX9-NEXT:    v_trunc_f32_e32 v13, v13
1975; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v19|, |v10|
1976; GFX9-NEXT:    v_mad_f32 v10, -v13, v3, v10
1977; GFX9-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
1978; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v10|, |v3|
1979; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v16
1980; GFX9-NEXT:    v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
1981; GFX9-NEXT:    v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
1982; GFX9-NEXT:    v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
1983; GFX9-NEXT:    v_mul_f32_e32 v3, v19, v3
1984; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
1985; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
1986; GFX9-NEXT:    v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
1987; GFX9-NEXT:    v_cvt_i32_f32_e32 v13, v13
1988; GFX9-NEXT:    v_cvt_i32_f32_e32 v18, v18
1989; GFX9-NEXT:    v_cvt_i32_f32_e32 v14, v14
1990; GFX9-NEXT:    v_mad_f32 v19, -v3, v16, v19
1991; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
1992; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 30, v15
1993; GFX9-NEXT:    v_or_b32_e32 v12, 1, v12
1994; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
1995; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 30, v10
1996; GFX9-NEXT:    v_or_b32_e32 v15, 1, v15
1997; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
1998; GFX9-NEXT:    v_or_b32_e32 v10, 1, v10
1999; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
2000; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v16|
2001; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
2002; GFX9-NEXT:    v_cndmask_b32_e64 v15, 0, v15, s[4:5]
2003; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
2004; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
2005; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
2006; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 24, v4
2007; GFX9-NEXT:    v_add_u32_e32 v2, v13, v2
2008; GFX9-NEXT:    v_add_u32_e32 v12, v18, v12
2009; GFX9-NEXT:    v_add_u32_e32 v13, v14, v15
2010; GFX9-NEXT:    v_add_u32_e32 v3, v3, v10
2011; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
2012; GFX9-NEXT:    v_mul_lo_u32 v4, v12, v11
2013; GFX9-NEXT:    v_mul_lo_u32 v10, v13, v0
2014; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v17
2015; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
2016; GFX9-NEXT:    v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2017; GFX9-NEXT:    v_sub_u32_e32 v4, v17, v10
2018; GFX9-NEXT:    v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2019; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2020; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2021; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2022; GFX9-NEXT:    global_store_dword v[5:6], v0, off
2023; GFX9-NEXT:    global_store_dword v[7:8], v1, off
2024; GFX9-NEXT:    s_waitcnt vmcnt(0)
2025; GFX9-NEXT:    s_setpc_b64 s[30:31]
2026  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2027  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2028  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2029  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2030  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2031  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 2>
2032  %vecins = srem <4 x i8> %shuffle0_0, %vec1
2033  store <4 x i8> %vecins, ptr addrspace(1) %out0
2034  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
2035  ret void
2036}
2037
2038
; sub_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x) and builds the two-source shuffle
; <7, 0, 7, 6> (indices 0-3 pick from %vec0, 4-7 from %vec1). The shuffle is
; the minuend of a vector sub with %vec1 (result stored to %out0) and is also
; stored unchanged to %out1. The assertions below expect the stored shuffle to
; be a single v_perm_b32 with selector 0x6070007. %elt is not referenced by the
; body.
2039define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2040; GFX10-LABEL: sub_store_div:
2041; GFX10:       ; %bb.0:
2042; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2043; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2044; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2045; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2046; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2047; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2048; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2049; GFX10-NEXT:    global_load_dword v2, v[2:3], off
2050; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2051; GFX10-NEXT:    s_waitcnt vmcnt(1)
2052; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
2053; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v2
2054; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
2055; GFX10-NEXT:    s_waitcnt vmcnt(0)
2056; GFX10-NEXT:    v_sub_nc_u16 v3, v0, v3
2057; GFX10-NEXT:    v_sub_nc_u16 v9, v1, v4
2058; GFX10-NEXT:    v_sub_nc_u16 v10, v4, v2
2059; GFX10-NEXT:    v_sub_nc_u16 v1, v4, v1
2060; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x6070007
2061; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
2062; GFX10-NEXT:    v_lshlrev_b16 v4, 8, v9
2063; GFX10-NEXT:    v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2064; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2065; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2066; GFX10-NEXT:    global_store_dword v[5:6], v1, off
2067; GFX10-NEXT:    global_store_dword v[7:8], v0, off
2068; GFX10-NEXT:    s_setpc_b64 s[30:31]
2069;
2070; GFX9-LABEL: sub_store_div:
2071; GFX9:       ; %bb.0:
2072; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2073; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2074; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2075; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2076; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2077; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2078; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2079; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2080; GFX9-NEXT:    global_load_dword v9, v[2:3], off
2081; GFX9-NEXT:    s_mov_b32 s4, 0x6070007
2082; GFX9-NEXT:    s_waitcnt vmcnt(0)
2083; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
2084; GFX9-NEXT:    v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
2085; GFX9-NEXT:    v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2086; GFX9-NEXT:    v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3
2087; GFX9-NEXT:    v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1
2088; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2089; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2090; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2091; GFX9-NEXT:    global_store_dword v[5:6], v1, off
2092; GFX9-NEXT:    global_store_dword v[7:8], v0, off
2093; GFX9-NEXT:    s_waitcnt vmcnt(0)
2094; GFX9-NEXT:    s_setpc_b64 s[30:31]
2095  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2096  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2097  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2098  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2099  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2100  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6>
2101  %vecins = sub <4 x i8> %shuffle0_0, %vec1
2102  store <4 x i8> %vecins, ptr addrspace(1) %out0
2103  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
2104  ret void
2105}
2106
2107
; sv_store_div: each workitem loads one <4 x i8> from %in0 and one from %in1
; (both indexed by workitem.id.x), builds %shuffle0_0 = <1, 3, 1, 4> of
; %vec0/%vec1, and then feeds it into a second shuffle, %insvec =
; <6, 3, 7, 0> of %shuffle0_0/%vec1. The assertions below contain exactly one
; v_perm_b32 (selector 0x50705) and one global_store_dword, to v[7:8].
; %elt, %out0 and %out2 are not referenced by the body.
2108define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
2109; GFX10-LABEL: sv_store_div:
2110; GFX10:       ; %bb.0:
2111; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2112; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2113; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2114; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2115; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2116; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2117; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2118; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2119; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2120; GFX10-NEXT:    s_waitcnt vmcnt(0)
2121; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x50705
2122; GFX10-NEXT:    global_store_dword v[7:8], v0, off
2123; GFX10-NEXT:    s_setpc_b64 s[30:31]
2124;
2125; GFX9-LABEL: sv_store_div:
2126; GFX9:       ; %bb.0:
2127; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2129; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2130; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2131; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2132; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2133; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2134; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2135; GFX9-NEXT:    global_load_dword v5, v[2:3], off
2136; GFX9-NEXT:    s_mov_b32 s4, 0x50705
2137; GFX9-NEXT:    s_waitcnt vmcnt(0)
2138; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
2139; GFX9-NEXT:    global_store_dword v[7:8], v0, off
2140; GFX9-NEXT:    s_waitcnt vmcnt(0)
2141; GFX9-NEXT:    s_setpc_b64 s[30:31]
2142  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2143  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2144  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2145  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2146  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2147  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 1, i32 4>
2148  %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0>
  ; NOTE(review): both stores below write to %out1, so the store of %insvec is
  ; immediately overwritten by the store of %shuffle0_0; this is why only one
  ; permute and one store are expected above. The sibling tests store to %out0
  ; and %out1 separately, so this presumably was meant to use two different
  ; pointers - confirm, and if changed, regenerate the assertions with
  ; utils/update_llc_test_checks.py.
2149  store <4 x i8> %insvec, ptr addrspace(1) %out1
2150  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
2151  ret void
2152}
2153
2154
; trunc_store_div: each workitem loads one <4 x i8> from %in0 and one from
; %in1 (both indexed by workitem.id.x) and builds the two-source shuffle
; <5, 2, 5, 0> (indices 0-3 pick from %vec0, 4-7 from %vec1). The shuffle is
; truncated to <4 x i1> and stored to %out1, which the assertions below check
; as a global_store_byte of a value masked with 15; the untruncated shuffle is
; stored to %out0 and is expected to be a single v_perm_b32 with selector
; 0x50205. %elt is not referenced by the body.
2155define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2156; GFX10-LABEL: trunc_store_div:
2157; GFX10:       ; %bb.0:
2158; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2159; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2160; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2161; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2162; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2163; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2164; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2165; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2166; GFX10-NEXT:    global_load_dword v9, v[2:3], off
2167; GFX10-NEXT:    v_mov_b32_e32 v0, 1
2168; GFX10-NEXT:    s_waitcnt vmcnt(1)
2169; GFX10-NEXT:    v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2170; GFX10-NEXT:    s_waitcnt vmcnt(0)
2171; GFX10-NEXT:    v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2172; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
2173; GFX10-NEXT:    v_lshlrev_b16 v2, 2, v0
2174; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
2175; GFX10-NEXT:    v_lshlrev_b16 v1, 3, v4
2176; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
2177; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
2178; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x50205
2179; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0
2180; GFX10-NEXT:    global_store_byte v[7:8], v0, off
2181; GFX10-NEXT:    global_store_dword v[5:6], v1, off
2182; GFX10-NEXT:    s_setpc_b64 s[30:31]
2183;
2184; GFX9-LABEL: trunc_store_div:
2185; GFX9:       ; %bb.0:
2186; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2188; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2189; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2190; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2191; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2192; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2193; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2194; GFX9-NEXT:    global_load_dword v9, v[2:3], off
2195; GFX9-NEXT:    v_mov_b32_e32 v0, 1
2196; GFX9-NEXT:    s_mov_b32 s4, 0x50205
2197; GFX9-NEXT:    s_waitcnt vmcnt(1)
2198; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 3, v4
2199; GFX9-NEXT:    s_waitcnt vmcnt(0)
2200; GFX9-NEXT:    v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2201; GFX9-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2202; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
2203; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
2204; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 2, v2
2205; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
2206; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
2207; GFX9-NEXT:    v_or_b32_e32 v0, v0, v3
2208; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
2209; GFX9-NEXT:    global_store_byte v[7:8], v0, off
2210; GFX9-NEXT:    global_store_dword v[5:6], v1, off
2211; GFX9-NEXT:    s_waitcnt vmcnt(0)
2212; GFX9-NEXT:    s_setpc_b64 s[30:31]
2213  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2214  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2215  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2216  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2217  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2218  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0>
2219  %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1>
2220  store <4 x i1> %insvec, ptr addrspace(1) %out1
2221  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
2222  ret void
2223}
2224
; udiv: each workitem loads one <4 x i8> from %in0 and one from %in1 (both
; indexed by workitem.id.x) and builds the two-source shuffle <3, 6, 0, 4>
; (indices 0-3 pick from %vec0, 4-7 from %vec1). The shuffle is the dividend
; of a vector udiv by %vec1 (result stored to %out0) and is also stored
; unchanged to %out1. The assertions below expect the stored shuffle to be a
; single v_perm_b32 with selector 0x40207, alongside the expanded per-byte
; unsigned-division sequence. %elt is not referenced by the body.
2225define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2226; GFX10-LABEL: udiv:
2227; GFX10:       ; %bb.0:
2228; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2229; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2230; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2231; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2232; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2233; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2234; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2235; GFX10-NEXT:    global_load_dword v2, v[2:3], off
2236; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2237; GFX10-NEXT:    s_waitcnt vmcnt(1)
2238; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
2239; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
2240; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2
2241; GFX10-NEXT:    s_waitcnt vmcnt(0)
2242; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v14, v0
2243; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
2244; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
2245; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3
2246; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9
2247; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v15, v0
2248; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
2249; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x40207
2250; GFX10-NEXT:    v_mul_f32_e32 v10, v14, v10
2251; GFX10-NEXT:    v_mul_f32_e32 v11, v4, v11
2252; GFX10-NEXT:    v_mul_f32_e32 v13, v1, v13
2253; GFX10-NEXT:    v_mul_f32_e32 v12, v15, v12
2254; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
2255; GFX10-NEXT:    v_trunc_f32_e32 v11, v11
2256; GFX10-NEXT:    v_trunc_f32_e32 v13, v13
2257; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
2258; GFX10-NEXT:    v_mad_f32 v14, -v10, v1, v14
2259; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
2260; GFX10-NEXT:    v_mad_f32 v16, -v11, v3, v4
2261; GFX10-NEXT:    v_mad_f32 v17, -v13, v9, v1
2262; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
2263; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v14|, v1
2264; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13
2265; GFX10-NEXT:    v_mad_f32 v15, -v12, v4, v15
2266; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
2267; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2268; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v16|, v3
2269; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2270; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v17|, v9
2271; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
2272; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2273; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v4
2274; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2275; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v9
2276; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2277; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2278; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2279; GFX10-NEXT:    global_store_dword v[5:6], v1, off
2280; GFX10-NEXT:    global_store_dword v[7:8], v0, off
2281; GFX10-NEXT:    s_setpc_b64 s[30:31]
2282;
2283; GFX9-LABEL: udiv:
2284; GFX9:       ; %bb.0:
2285; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2286; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2287; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2288; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2289; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2290; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2291; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2292; GFX9-NEXT:    global_load_dword v4, v[2:3], off
2293; GFX9-NEXT:    global_load_dword v9, v[0:1], off
2294; GFX9-NEXT:    s_mov_b32 s4, 0x40207
2295; GFX9-NEXT:    s_waitcnt vmcnt(1)
2296; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
2297; GFX9-NEXT:    v_rcp_iflag_f32_e32 v11, v2
2298; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
2299; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v3
2300; GFX9-NEXT:    s_waitcnt vmcnt(0)
2301; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v1, v9
2302; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v10, v4
2303; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v10
2304; GFX9-NEXT:    v_mul_f32_e32 v11, v1, v11
2305; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
2306; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v4, v4
2307; GFX9-NEXT:    v_trunc_f32_e32 v11, v11
2308; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v4
2309; GFX9-NEXT:    v_mul_f32_e32 v12, v10, v12
2310; GFX9-NEXT:    v_mad_f32 v1, -v11, v2, v1
2311; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11
2312; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v9, v9
2313; GFX9-NEXT:    v_trunc_f32_e32 v12, v12
2314; GFX9-NEXT:    v_mul_f32_e32 v13, v9, v13
2315; GFX9-NEXT:    v_mad_f32 v15, -v12, v3, v10
2316; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
2317; GFX9-NEXT:    v_trunc_f32_e32 v13, v13
2318; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
2319; GFX9-NEXT:    v_mul_f32_e32 v14, v2, v14
2320; GFX9-NEXT:    v_mad_f32 v9, -v13, v10, v9
2321; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
2322; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
2323; GFX9-NEXT:    v_trunc_f32_e32 v14, v14
2324; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v15|, v3
2325; GFX9-NEXT:    v_mad_f32 v16, -v14, v4, v2
2326; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
2327; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
2328; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v9|, v10
2329; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
2330; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v16|, v4
2331; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
2332; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
2333; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
2334; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2335; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2336; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2337; GFX9-NEXT:    global_store_dword v[5:6], v1, off
2338; GFX9-NEXT:    global_store_dword v[7:8], v0, off
2339; GFX9-NEXT:    s_waitcnt vmcnt(0)
2340; GFX9-NEXT:    s_setpc_b64 s[30:31]
2341  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2342  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2343  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2344  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2345  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2346  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4>
2347  %vecins = udiv <4 x i8> %shuffle0_0, %vec1
2348  store <4 x i8> %vecins, ptr addrspace(1) %out0
2349  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
2350  ret void
2351}
2352
2353
; uitofp_store_div: each workitem loads one <4 x i8> from %in0 and one from
; %in1 (both indexed by workitem.id.x) and builds the two-source shuffle
; <4, 1, 2, 5> (indices 0-3 pick from %vec0, 4-7 from %vec1). The shuffle is
; converted with uitofp to <4 x float> and stored to %out1, which the
; assertions below check as four v_cvt_f32_ubyteN conversions feeding one
; global_store_dwordx4; the unconverted shuffle is stored to %out0 and is
; expected to be a single v_perm_b32 with selector 0x5020104. %elt is not
; referenced by the body.
2354define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2355; GFX10-LABEL: uitofp_store_div:
2356; GFX10:       ; %bb.0:
2357; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2358; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2359; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2360; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2361; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2362; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2363; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2364; GFX10-NEXT:    global_load_dword v4, v[2:3], off
2365; GFX10-NEXT:    global_load_dword v9, v[0:1], off
2366; GFX10-NEXT:    s_waitcnt vmcnt(1)
2367; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
2368; GFX10-NEXT:    s_waitcnt vmcnt(0)
2369; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v9
2370; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v9
2371; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
2372; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x5020104
2373; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2374; GFX10-NEXT:    global_store_dword v[5:6], v4, off
2375; GFX10-NEXT:    s_setpc_b64 s[30:31]
2376;
2377; GFX9-LABEL: uitofp_store_div:
2378; GFX9:       ; %bb.0:
2379; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2381; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2382; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2383; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2384; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2385; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2386; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2387; GFX9-NEXT:    global_load_dword v9, v[2:3], off
2388; GFX9-NEXT:    s_mov_b32 s4, 0x5020104
2389; GFX9-NEXT:    s_waitcnt vmcnt(1)
2390; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
2391; GFX9-NEXT:    s_waitcnt vmcnt(0)
2392; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v9
2393; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
2394; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
2395; GFX9-NEXT:    v_perm_b32 v10, v9, v4, s4
2396; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2397; GFX9-NEXT:    global_store_dword v[5:6], v10, off
2398; GFX9-NEXT:    s_waitcnt vmcnt(0)
2399; GFX9-NEXT:    s_setpc_b64 s[30:31]
2400  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2401  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2402  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2403  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2404  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2405  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 2, i32 5>
2406  %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float>
2407  store <4 x float> %insvec, ptr addrspace(1) %out1
2408  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
2409  ret void
2410}
2411
2412
; urem_store_div: the shuffle <5, 5, 5, 2> of two <4 x i8> loads (addressed by
; the workitem id) has two users: a urem by %vec1, stored to %out0, and a
; direct store to %out1. The checks expect the directly stored copy to be a
; single v_perm_b32 with selector 0x2050505, next to the expanded per-byte
; urem sequence. %elt is not used by the body.
; NOTE(review): "div" presumably means divergent (tid-indexed) - confirm.
2413define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2414; GFX10-LABEL: urem_store_div:
2415; GFX10:       ; %bb.0:
2416; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2418; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2419; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2420; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2421; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2422; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2423; GFX10-NEXT:    global_load_dword v2, v[2:3], off
2424; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2425; GFX10-NEXT:    s_waitcnt vmcnt(1)
2426; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
2427; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
2428; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
2429; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2
2430; GFX10-NEXT:    s_waitcnt vmcnt(0)
2431; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v15, v0
2432; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
2433; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3
2434; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
2435; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9
2436; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
2437; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
2438; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
2439; GFX10-NEXT:    v_mul_f32_e32 v10, v3, v10
2440; GFX10-NEXT:    v_mul_f32_e32 v11, v3, v11
2441; GFX10-NEXT:    v_mul_f32_e32 v12, v3, v12
2442; GFX10-NEXT:    v_mul_f32_e32 v13, v15, v13
2443; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
2444; GFX10-NEXT:    v_trunc_f32_e32 v11, v11
2445; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
2446; GFX10-NEXT:    v_trunc_f32_e32 v13, v13
2447; GFX10-NEXT:    v_mad_f32 v18, -v10, v1, v3
2448; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
2449; GFX10-NEXT:    v_mad_f32 v19, -v11, v3, v3
2450; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
2451; GFX10-NEXT:    v_mad_f32 v20, -v12, v4, v3
2452; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
2453; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
2454; GFX10-NEXT:    v_mad_f32 v15, -v13, v9, v15
2455; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13
2456; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2457; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, v3
2458; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
2459; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2460; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
2461; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v16
2462; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v16, v1
2463; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2464; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
2465; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v14
2466; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2467; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2468; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2469; GFX10-NEXT:    v_mul_lo_u32 v9, v9, v17
2470; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v16, v4
2471; GFX10-NEXT:    v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2472; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x2050505
2473; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2474; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2475; GFX10-NEXT:    global_store_dword v[5:6], v1, off
2476; GFX10-NEXT:    global_store_dword v[7:8], v0, off
2477; GFX10-NEXT:    s_setpc_b64 s[30:31]
2478;
2479; GFX9-LABEL: urem_store_div:
2480; GFX9:       ; %bb.0:
2481; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2482; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2483; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2484; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2485; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2486; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2487; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2488; GFX9-NEXT:    global_load_dword v4, v[2:3], off
2489; GFX9-NEXT:    global_load_dword v9, v[0:1], off
2490; GFX9-NEXT:    s_mov_b32 s4, 0x2050505
2491; GFX9-NEXT:    s_waitcnt vmcnt(1)
2492; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
2493; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2
2494; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
2495; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v3
2496; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v11, v4
2497; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v11
2498; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15
2499; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v14, v4
2500; GFX9-NEXT:    v_trunc_f32_e32 v15, v15
2501; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v14
2502; GFX9-NEXT:    v_mul_f32_e32 v16, v3, v16
2503; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3
2504; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
2505; GFX9-NEXT:    v_trunc_f32_e32 v16, v16
2506; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17
2507; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, v2
2508; GFX9-NEXT:    v_mad_f32 v2, -v16, v3, v3
2509; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v16
2510; GFX9-NEXT:    s_waitcnt vmcnt(0)
2511; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v13, v9
2512; GFX9-NEXT:    v_trunc_f32_e32 v17, v17
2513; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v18
2514; GFX9-NEXT:    v_mad_f32 v19, -v17, v11, v3
2515; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v17
2516; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
2517; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
2518; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2519; GFX9-NEXT:    v_mad_f32 v13, -v18, v14, v13
2520; GFX9-NEXT:    v_cvt_u32_f32_e32 v18, v18
2521; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v16, vcc
2522; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, v11
2523; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v17, vcc
2524; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, v14
2525; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
2526; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
2527; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
2528; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v18, vcc
2529; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
2530; GFX9-NEXT:    v_mul_lo_u32 v4, v15, v4
2531; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v10
2532; GFX9-NEXT:    v_mul_lo_u32 v0, v3, v0
2533; GFX9-NEXT:    v_mul_lo_u32 v3, v11, v12
2534; GFX9-NEXT:    v_sub_u32_e32 v4, v10, v4
2535; GFX9-NEXT:    v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2536; GFX9-NEXT:    v_sub_u32_e32 v0, v10, v0
2537; GFX9-NEXT:    v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2538; GFX9-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2539; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2540; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2541; GFX9-NEXT:    global_store_dword v[5:6], v0, off
2542; GFX9-NEXT:    global_store_dword v[7:8], v1, off
2543; GFX9-NEXT:    s_waitcnt vmcnt(0)
2544; GFX9-NEXT:    s_setpc_b64 s[30:31]
2545  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2546  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2547  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2548  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2549  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2550  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
2551  %vecins = urem <4 x i8> %shuffle0_0, %vec1
  ; Both the arithmetic result and the raw shuffle are stored, so the shuffle
  ; has two users.
2552  store <4 x i8> %vecins, ptr addrspace(1) %out0
2553  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
2554  ret void
2555}
2556
2557
; xor_store_div: the shuffle <7, 3, 6, 5> of two <4 x i8> loads (addressed by
; the workitem id) is xor'ed with the constant <1, 2, 2, 1> and stored to
; %out0; the unmodified shuffle is also stored to %out1. The checks expect the
; stored shuffle to be a single v_perm_b32 with selector 0x5060307.
; %elt is not used by the body.
2558define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2559; GFX10-LABEL: xor_store_div:
2560; GFX10:       ; %bb.0:
2561; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2563; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2564; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2565; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2566; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2567; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2568; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2569; GFX10-NEXT:    global_load_dword v9, v[2:3], off
2570; GFX10-NEXT:    v_mov_b32_e32 v0, 0xffffff00
2571; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2572; GFX10-NEXT:    v_mov_b32_e32 v2, 2
2573; GFX10-NEXT:    s_waitcnt vmcnt(1)
2574; GFX10-NEXT:    v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2575; GFX10-NEXT:    s_waitcnt vmcnt(0)
2576; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v9
2577; GFX10-NEXT:    v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2578; GFX10-NEXT:    v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2579; GFX10-NEXT:    v_xor_b32_e32 v0, 0x200, v0
2580; GFX10-NEXT:    v_xor_b32_e32 v3, 0x100, v3
2581; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
2582; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2583; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2584; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5060307
2585; GFX10-NEXT:    global_store_dword v[5:6], v0, off
2586; GFX10-NEXT:    global_store_dword v[7:8], v1, off
2587; GFX10-NEXT:    s_setpc_b64 s[30:31]
2588;
2589; GFX9-LABEL: xor_store_div:
2590; GFX9:       ; %bb.0:
2591; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2592; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2593; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2594; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2595; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2596; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2597; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2598; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2599; GFX9-NEXT:    global_load_dword v9, v[2:3], off
2600; GFX9-NEXT:    s_movk_i32 s4, 0xff00
2601; GFX9-NEXT:    v_mov_b32_e32 v0, 1
2602; GFX9-NEXT:    v_mov_b32_e32 v1, 2
2603; GFX9-NEXT:    s_mov_b32 s5, 0x5060307
2604; GFX9-NEXT:    s_waitcnt vmcnt(1)
2605; GFX9-NEXT:    v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2606; GFX9-NEXT:    s_waitcnt vmcnt(0)
2607; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff00, v9
2608; GFX9-NEXT:    v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2609; GFX9-NEXT:    v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2610; GFX9-NEXT:    v_xor_b32_e32 v2, 0x200, v2
2611; GFX9-NEXT:    v_xor_b32_e32 v3, 0x100, v3
2612; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
2613; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2614; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2615; GFX9-NEXT:    v_perm_b32 v4, v9, v4, s5
2616; GFX9-NEXT:    global_store_dword v[5:6], v0, off
2617; GFX9-NEXT:    global_store_dword v[7:8], v4, off
2618; GFX9-NEXT:    s_waitcnt vmcnt(0)
2619; GFX9-NEXT:    s_setpc_b64 s[30:31]
2620  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2621  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2622  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2623  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2624  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2625  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 5>
2626  %vecins = xor <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1>
  ; Both the xor result and the raw shuffle are stored, so the shuffle has two
  ; users.
2627  store <4 x i8> %vecins, ptr addrspace(1) %out0
2628  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1
2629  ret void
2630}
2631
2632
; zext_store_div: the shuffle <0, 1, 2, 4> of two <4 x i8> loads (addressed by
; the workitem id) is zero-extended to <4 x i16> and stored to %out1; the
; unmodified shuffle is also stored to %out0. The checks expect the stored
; shuffle to be a single v_perm_b32 with selector 0x60504, and the widened
; vector to be assembled with two v_perm_b32 using selector 0x5040100.
; %elt is not used by the body.
2633define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
2634; GFX10-LABEL: zext_store_div:
2635; GFX10:       ; %bb.0:
2636; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2637; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2638; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2639; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2640; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2641; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
2642; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2643; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2644; GFX10-NEXT:    global_load_dword v9, v[2:3], off
2645; GFX10-NEXT:    v_mov_b32_e32 v0, 0xff
2646; GFX10-NEXT:    s_waitcnt vmcnt(1)
2647; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v4
2648; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v4
2649; GFX10-NEXT:    s_waitcnt vmcnt(0)
2650; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v9
2651; GFX10-NEXT:    v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2652; GFX10-NEXT:    v_perm_b32 v0, v1, v2, 0x5040100
2653; GFX10-NEXT:    v_perm_b32 v2, v4, v9, 0x60504
2654; GFX10-NEXT:    v_perm_b32 v1, v3, v10, 0x5040100
2655; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
2656; GFX10-NEXT:    global_store_dword v[5:6], v2, off
2657; GFX10-NEXT:    s_setpc_b64 s[30:31]
2658;
2659; GFX9-LABEL: zext_store_div:
2660; GFX9:       ; %bb.0:
2661; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2662; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
2663; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
2664; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2665; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2666; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
2667; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2668; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2669; GFX9-NEXT:    global_load_dword v9, v[2:3], off
2670; GFX9-NEXT:    s_mov_b32 s4, 0x60504
2671; GFX9-NEXT:    s_movk_i32 s5, 0xff
2672; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
2673; GFX9-NEXT:    s_waitcnt vmcnt(1)
2674; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v4
2675; GFX9-NEXT:    s_waitcnt vmcnt(0)
2676; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
2677; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v4
2678; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v9
2679; GFX9-NEXT:    v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2680; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s6
2681; GFX9-NEXT:    v_perm_b32 v1, v3, v4, s6
2682; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
2683; GFX9-NEXT:    global_store_dword v[5:6], v2, off
2684; GFX9-NEXT:    s_waitcnt vmcnt(0)
2685; GFX9-NEXT:    s_setpc_b64 s[30:31]
2686  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2687  %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
2688  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
2689  %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
2690  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
2691  %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
2692  %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>
  ; Note the destinations: the widened vector goes to %out1, the raw shuffle
  ; to %out0.
2693  store <4 x i16> %insvec, ptr addrspace(1) %out1
2694  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
2695  ret void
2696}
2697
; Source16Bit: builds an i32 out of the bytes of two 16-bit sources, the scalar
; %in and element 1 of %reg, using and/shl/lshr/or on i16 values:
;   low  half (%byte0) = (low byte of elt << 8) | low byte of %in
;   high half (%byte1) = (high byte of elt, left in bits 15:8) | (high byte of %in >> 8)
; The two halves are zero-extended and combined as (%byte1 << 16) | %byte0.
; The checks expect the whole sequence to be a single v_perm_b32 with selector
; 0x3050204. The result is stored to an undef address.
2698define void @Source16Bit(i16 %in, <2 x i16> %reg) {
2699; GFX10-LABEL: Source16Bit:
2700; GFX10:       ; %bb.0: ; %entry
2701; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2702; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3050204
2703; GFX10-NEXT:    global_store_dword v[0:1], v0, off
2704; GFX10-NEXT:    s_setpc_b64 s[30:31]
2705;
2706; GFX9-LABEL: Source16Bit:
2707; GFX9:       ; %bb.0: ; %entry
2708; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2709; GFX9-NEXT:    s_mov_b32 s4, 0x3050204
2710; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
2711; GFX9-NEXT:    global_store_dword v[0:1], v0, off
2712; GFX9-NEXT:    s_waitcnt vmcnt(0)
2713; GFX9-NEXT:    s_setpc_b64 s[30:31]
2714entry:
2715  %elt0 = extractelement <2 x i16> %reg, i32 1
  ; Split each 16-bit source into its low (& 255) and high (& -256) byte.
2716  %e0b0 = and i16 %elt0, 255
2717  %e0b1 = and i16 %elt0, -256
2718  %e1b0 = and i16 %in, 255
2719  %e1b1 = and i16 %in, -256
  ; Despite the names, %byte0 and %byte1 are 16-bit halves of the result.
2720  %tmp0 = shl i16 %e0b0, 8
2721  %byte0 = or i16 %tmp0, %e1b0
2722  %tmp2 = lshr i16 %e1b1, 8
2723  %byte1 = or i16 %e0b1, %tmp2
2724  %ext0 = zext i16 %byte0 to i32
2725  %ext1 = zext i16 %byte1 to i32
2726  %shifted = shl i32 %ext1, 16
2727  %result = or i32 %shifted, %ext0
2728  store i32 %result, ptr addrspace(1) undef
2729  ret void
2730}
2731
; extract3744: assembles an i32 from individually extracted bytes of two
; <4 x i8> loads using zext/shl/or:
;   bits  7:0  = %vec1[0]
;   bits 15:8  = %vec1[0]
;   bits 23:16 = %vec1[3]
;   bits 31:24 = %vec2[3]
; The checks expect a single v_perm_b32 with selector 0x3070404.
2732define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2733; GFX10-LABEL: extract3744:
2734; GFX10:       ; %bb.0:
2735; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2737; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2738; GFX10-NEXT:    s_waitcnt vmcnt(0)
2739; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3070404
2740; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2741; GFX10-NEXT:    s_setpc_b64 s[30:31]
2742;
2743; GFX9-LABEL: extract3744:
2744; GFX9:       ; %bb.0:
2745; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2746; GFX9-NEXT:    global_load_dword v6, v[0:1], off
2747; GFX9-NEXT:    global_load_dword v7, v[2:3], off
2748; GFX9-NEXT:    s_mov_b32 s4, 0x3070404
2749; GFX9-NEXT:    s_waitcnt vmcnt(0)
2750; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
2751; GFX9-NEXT:    global_store_dword v[4:5], v0, off
2752; GFX9-NEXT:    s_waitcnt vmcnt(0)
2753; GFX9-NEXT:    s_setpc_b64 s[30:31]
2754  %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2755  %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
  ; %vec1[0] is used twice: unshifted (%zv1e0) and shifted into bits 15:8.
2756  %v1e0 = extractelement <4 x i8> %vec1, i64 0
2757  %zv1e0 = zext i8 %v1e0 to i32
2758  %byte1 = shl i32 %zv1e0, 8
2759
2760  %v1e3 = extractelement <4 x i8> %vec1, i64 3
2761  %zv1e3 = zext i8 %v1e3 to i32
2762  %byte2 = shl i32 %zv1e3, 16
2763  %v2e3 = extractelement <4 x i8> %vec2, i64 3
2764  %zv2e3 = zext i8 %v2e3 to i32
2765  %byte3 = shl i32 %zv2e3, 24
2766
2767  %tmp0 = or i32 %zv1e0, %byte1
2768  %tmp1 = or i32 %tmp0, %byte2
2769  %res = or i32 %tmp1, %byte3
2770  store i32 %res, ptr addrspace(1) %out0, align 4
2771  ret void
2772}
2773
; AMDGPU permute intrinsic (three i32 operands, i32 result); called directly by
; @extract_perm_3744 below.
2774declare i32 @llvm.amdgcn.perm(i32, i32, i32)
2775
; extract_perm_3744: two explicit llvm.amdgcn.perm calls, each with the same
; value passed as both data operands, are combined with an `or`. The checks
; expect the pair to be merged into a single v_perm_b32 over both loaded
; dwords with selector 0x3070404.
2776define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2777; GFX10-LABEL: extract_perm_3744:
2778; GFX10:       ; %bb.0:
2779; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2780; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2781; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2782; GFX10-NEXT:    s_waitcnt vmcnt(0)
2783; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3070404
2784; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2785; GFX10-NEXT:    s_setpc_b64 s[30:31]
2786;
2787; GFX9-LABEL: extract_perm_3744:
2788; GFX9:       ; %bb.0:
2789; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2790; GFX9-NEXT:    global_load_dword v6, v[0:1], off
2791; GFX9-NEXT:    global_load_dword v7, v[2:3], off
2792; GFX9-NEXT:    s_mov_b32 s4, 0x3070404
2793; GFX9-NEXT:    s_waitcnt vmcnt(0)
2794; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
2795; GFX9-NEXT:    global_store_dword v[4:5], v0, off
2796; GFX9-NEXT:    s_waitcnt vmcnt(0)
2797; GFX9-NEXT:    s_setpc_b64 s[30:31]
2798  %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2799  %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
2800  %cast1 = bitcast <4 x i8> %vec1 to i32
2801  %cast2 = bitcast <4 x i8> %vec2 to i32
  ; Selector 201523200 is 0x0C030000 in hex.
  ; NOTE(review): presumably selector byte 0x0C yields a constant zero byte, so
  ; this keeps bytes {0, 0, 3} of %cast1 in the low 24 bits - confirm against
  ; the V_PERM_B32 description in the ISA guide.
2802  %lo24 = call i32 @llvm.amdgcn.perm(i32 %cast1, i32 %cast1, i32 201523200)
  ; Selector 51121164 is 0x030C0C0C in hex (presumably byte 3 of %cast2 in the
  ; top byte, zeros below - confirm as above).
2803  %hi8 = call i32 @llvm.amdgcn.perm(i32 %cast2, i32 %cast2, i32 51121164)
2804  %res = or i32 %hi8, %lo24
2805  store i32 %res, ptr addrspace(1) %out0, align 4
2806  ret void
2807}
2808
; extract1347_v2i16: same kind of byte gathering as the <4 x i8> extract tests,
; but the sources are <2 x i16> loads and the bytes are isolated with i16
; and/lshr before being widened:
;   bits  7:0  = high byte of %vec2[1]
;   bits 15:8  = low  byte of %vec2[0]
;   bits 23:16 = high byte of %vec1[1]
;   bits 31:24 = high byte of %vec1[0]
; The checks expect a single v_perm_b32 with selector 0x1030407.
2809define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2810; GFX10-LABEL: extract1347_v2i16:
2811; GFX10:       ; %bb.0:
2812; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2814; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2815; GFX10-NEXT:    s_waitcnt vmcnt(0)
2816; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1030407
2817; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2818; GFX10-NEXT:    s_setpc_b64 s[30:31]
2819;
2820; GFX9-LABEL: extract1347_v2i16:
2821; GFX9:       ; %bb.0:
2822; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2823; GFX9-NEXT:    global_load_dword v6, v[0:1], off
2824; GFX9-NEXT:    global_load_dword v7, v[2:3], off
2825; GFX9-NEXT:    s_mov_b32 s4, 0x1030407
2826; GFX9-NEXT:    s_waitcnt vmcnt(0)
2827; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
2828; GFX9-NEXT:    global_store_dword v[4:5], v0, off
2829; GFX9-NEXT:    s_waitcnt vmcnt(0)
2830; GFX9-NEXT:    s_setpc_b64 s[30:31]
2831  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2832  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2833  %v1e0 = extractelement <2 x i16> %vec1, i64 0
2834  %v1e1 = extractelement <2 x i16> %vec1, i64 1
2835  %v2e0 = extractelement <2 x i16> %vec2, i64 0
2836  %v2e1 = extractelement <2 x i16> %vec2, i64 1
2837
  ; Result byte 0: high byte of %v2e1.
2838  %b0t0 = and i16 -256, %v2e1
2839  %b0t1 = lshr i16 %b0t0, 8
2840  %byte0 = zext i16 %b0t1 to i32
2841
  ; Result byte 1: low byte of %v2e0.
2842  %b1t0 = and i16 255, %v2e0
2843  %b1t1 = zext i16 %b1t0 to i32
2844  %byte1 = shl i32 %b1t1, 8
2845
  ; Result byte 2: high byte of %v1e1.
2846  %b2t0 = and i16 -256, %v1e1
2847  %b2t1 = lshr i16 %b2t0, 8
2848  %b2t2 = zext i16 %b2t1 to i32
2849  %byte2 = shl i32 %b2t2, 16
2850
  ; Result byte 3: high byte of %v1e0.
2851  %b3t0 = and i16 -256, %v1e0
2852  %b3t1 = lshr i16 %b3t0, 8
2853  %b3t2 = zext i16 %b3t1 to i32
2854  %byte3 = shl i32 %b3t2, 24
2855
2856  %tmp0 = or i32 %byte0, %byte1
2857  %tmp1 = or i32 %tmp0, %byte2
2858  %res = or i32 %tmp1, %byte3
2859  store i32 %res, ptr addrspace(1) %out0, align 4
2860  ret void
2861}
2862
2863
; 16-bit funnel-shift-right intrinsic; used by the @fshri16_* tests below.
2864declare i16 @llvm.fshr.i16(i16, i16, i16)
2865
; fshri16_8: two i16 funnel shifts right by 8, one per source vector, are
; packed into an i32: fshr(%v2e0, %v2e1, 8) in the low half and
; fshr(%v1e0, %v1e1, 8) in the high half. The checks expect a single
; v_perm_b32 with selector 0x30407.
2866define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2867; GFX10-LABEL: fshri16_8:
2868; GFX10:       ; %bb.0:
2869; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2870; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2871; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2872; GFX10-NEXT:    s_waitcnt vmcnt(0)
2873; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
2874; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2875; GFX10-NEXT:    s_setpc_b64 s[30:31]
2876;
2877; GFX9-LABEL: fshri16_8:
2878; GFX9:       ; %bb.0:
2879; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2880; GFX9-NEXT:    global_load_dword v6, v[0:1], off
2881; GFX9-NEXT:    global_load_dword v7, v[2:3], off
2882; GFX9-NEXT:    s_mov_b32 s4, 0x30407
2883; GFX9-NEXT:    s_waitcnt vmcnt(0)
2884; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
2885; GFX9-NEXT:    global_store_dword v[4:5], v0, off
2886; GFX9-NEXT:    s_waitcnt vmcnt(0)
2887; GFX9-NEXT:    s_setpc_b64 s[30:31]
2888  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2889  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2890  %v1e0 = extractelement <2 x i16> %vec1, i64 0
2891  %v1e1 = extractelement <2 x i16> %vec1, i64 1
2892  %v2e0 = extractelement <2 x i16> %vec2, i64 0
2893  %v2e1 = extractelement <2 x i16> %vec2, i64 1
2894
  ; Low 16 bits of the result come from %vec2.
2895  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
2896  %byte01 = zext i16 %tmp01.0 to i32
2897
  ; High 16 bits of the result come from %vec1.
2898  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
2899  %tmp23.1 = zext i16 %tmp23.0 to i32
2900  %byte23 = shl i32 %tmp23.1, 16
2901  %res = or i32 %byte01, %byte23
2902  store i32 %res, ptr addrspace(1) %out0, align 4
2903  ret void
2904}
2905
; fshri16_16: same structure as @fshri16_8 but with a shift amount of 16, i.e.
; equal to the i16 bit width. The checks expect a single v_perm_b32 with
; selector 0x3020706.
; NOTE(review): per the LangRef the funnel-shift amount is taken modulo the
; bit width, so this should behave like a shift by 0 - confirm.
2906define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2907; GFX10-LABEL: fshri16_16:
2908; GFX10:       ; %bb.0:
2909; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2910; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2911; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2912; GFX10-NEXT:    s_waitcnt vmcnt(0)
2913; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706
2914; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2915; GFX10-NEXT:    s_setpc_b64 s[30:31]
2916;
2917; GFX9-LABEL: fshri16_16:
2918; GFX9:       ; %bb.0:
2919; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2920; GFX9-NEXT:    global_load_dword v6, v[0:1], off
2921; GFX9-NEXT:    global_load_dword v7, v[2:3], off
2922; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
2923; GFX9-NEXT:    s_waitcnt vmcnt(0)
2924; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
2925; GFX9-NEXT:    global_store_dword v[4:5], v0, off
2926; GFX9-NEXT:    s_waitcnt vmcnt(0)
2927; GFX9-NEXT:    s_setpc_b64 s[30:31]
2928  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2929  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2930  %v1e0 = extractelement <2 x i16> %vec1, i64 0
2931  %v1e1 = extractelement <2 x i16> %vec1, i64 1
2932  %v2e0 = extractelement <2 x i16> %vec2, i64 0
2933  %v2e1 = extractelement <2 x i16> %vec2, i64 1
2934
  ; Low 16 bits of the result come from %vec2.
2935  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
2936  %byte01 = zext i16 %tmp01.0 to i32
2937
  ; High 16 bits of the result come from %vec1.
2938  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
2939  %tmp23.1 = zext i16 %tmp23.0 to i32
2940  %byte23 = shl i32 %tmp23.1, 16
2941  %res = or i32 %byte01, %byte23
2942  store i32 %res, ptr addrspace(1) %out0, align 4
2943  ret void
2944}
2945
; fshri16_24: same structure as @fshri16_8 but with a shift amount of 24,
; larger than the i16 bit width. The expected selector (0x30407) is the same
; one checked for @fshri16_8.
; NOTE(review): consistent with the LangRef taking the funnel-shift amount
; modulo the bit width (24 mod 16 = 8) - confirm.
2946define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2947; GFX10-LABEL: fshri16_24:
2948; GFX10:       ; %bb.0:
2949; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2950; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2951; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2952; GFX10-NEXT:    s_waitcnt vmcnt(0)
2953; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
2954; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2955; GFX10-NEXT:    s_setpc_b64 s[30:31]
2956;
2957; GFX9-LABEL: fshri16_24:
2958; GFX9:       ; %bb.0:
2959; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2960; GFX9-NEXT:    global_load_dword v6, v[0:1], off
2961; GFX9-NEXT:    global_load_dword v7, v[2:3], off
2962; GFX9-NEXT:    s_mov_b32 s4, 0x30407
2963; GFX9-NEXT:    s_waitcnt vmcnt(0)
2964; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
2965; GFX9-NEXT:    global_store_dword v[4:5], v0, off
2966; GFX9-NEXT:    s_waitcnt vmcnt(0)
2967; GFX9-NEXT:    s_setpc_b64 s[30:31]
2968  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
2969  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
2970  %v1e0 = extractelement <2 x i16> %vec1, i64 0
2971  %v1e1 = extractelement <2 x i16> %vec1, i64 1
2972  %v2e0 = extractelement <2 x i16> %vec2, i64 0
2973  %v2e1 = extractelement <2 x i16> %vec2, i64 1
2974
  ; Low 16 bits of the result come from %vec2.
2975  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
2976  %byte01 = zext i16 %tmp01.0 to i32
2977
  ; High 16 bits of the result come from %vec1.
2978  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
2979  %tmp23.1 = zext i16 %tmp23.0 to i32
2980  %byte23 = shl i32 %tmp23.1, 16
2981  %res = or i32 %byte01, %byte23
2982  store i32 %res, ptr addrspace(1) %out0, align 4
2983  ret void
2984}
2985
; fshri16_32: same structure as @fshri16_8 but with a shift amount of 32,
; twice the i16 bit width. The expected selector (0x3020706) is the same one
; checked for @fshri16_16.
; NOTE(review): consistent with the LangRef taking the funnel-shift amount
; modulo the bit width (32 mod 16 = 0) - confirm.
2986define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2987; GFX10-LABEL: fshri16_32:
2988; GFX10:       ; %bb.0:
2989; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2990; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2991; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2992; GFX10-NEXT:    s_waitcnt vmcnt(0)
2993; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706
2994; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2995; GFX10-NEXT:    s_setpc_b64 s[30:31]
2996;
2997; GFX9-LABEL: fshri16_32:
2998; GFX9:       ; %bb.0:
2999; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3000; GFX9-NEXT:    global_load_dword v6, v[0:1], off
3001; GFX9-NEXT:    global_load_dword v7, v[2:3], off
3002; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
3003; GFX9-NEXT:    s_waitcnt vmcnt(0)
3004; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
3005; GFX9-NEXT:    global_store_dword v[4:5], v0, off
3006; GFX9-NEXT:    s_waitcnt vmcnt(0)
3007; GFX9-NEXT:    s_setpc_b64 s[30:31]
3008  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3009  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3010  %v1e0 = extractelement <2 x i16> %vec1, i64 0
3011  %v1e1 = extractelement <2 x i16> %vec1, i64 1
3012  %v2e0 = extractelement <2 x i16> %vec2, i64 0
3013  %v2e1 = extractelement <2 x i16> %vec2, i64 1
3014
  ; Low 16 bits of the result come from %vec2.
3015  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
3016  %byte01 = zext i16 %tmp01.0 to i32
3017
  ; High 16 bits of the result come from %vec1.
3018  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
3019  %tmp23.1 = zext i16 %tmp23.0 to i32
3020  %byte23 = shl i32 %tmp23.1, 16
3021  %res = or i32 %byte01, %byte23
3022  store i32 %res, ptr addrspace(1) %out0, align 4
3023  ret void
3024}
3025
; fshri16_88: same structure as @fshri16_8 but with a shift amount of 88, far
; beyond the i16 bit width. The expected selector (0x30407) is the same one
; checked for @fshri16_8.
; NOTE(review): consistent with the LangRef taking the funnel-shift amount
; modulo the bit width (88 mod 16 = 8) - confirm.
3026define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3027; GFX10-LABEL: fshri16_88:
3028; GFX10:       ; %bb.0:
3029; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3030; GFX10-NEXT:    global_load_dword v6, v[0:1], off
3031; GFX10-NEXT:    global_load_dword v7, v[2:3], off
3032; GFX10-NEXT:    s_waitcnt vmcnt(0)
3033; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
3034; GFX10-NEXT:    global_store_dword v[4:5], v0, off
3035; GFX10-NEXT:    s_setpc_b64 s[30:31]
3036;
3037; GFX9-LABEL: fshri16_88:
3038; GFX9:       ; %bb.0:
3039; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3040; GFX9-NEXT:    global_load_dword v6, v[0:1], off
3041; GFX9-NEXT:    global_load_dword v7, v[2:3], off
3042; GFX9-NEXT:    s_mov_b32 s4, 0x30407
3043; GFX9-NEXT:    s_waitcnt vmcnt(0)
3044; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
3045; GFX9-NEXT:    global_store_dword v[4:5], v0, off
3046; GFX9-NEXT:    s_waitcnt vmcnt(0)
3047; GFX9-NEXT:    s_setpc_b64 s[30:31]
3048  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3049  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3050  %v1e0 = extractelement <2 x i16> %vec1, i64 0
3051  %v1e1 = extractelement <2 x i16> %vec1, i64 1
3052  %v2e0 = extractelement <2 x i16> %vec2, i64 0
3053  %v2e1 = extractelement <2 x i16> %vec2, i64 1
3054
  ; Low 16 bits of the result come from %vec2.
3055  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
3056  %byte01 = zext i16 %tmp01.0 to i32
3057
  ; High 16 bits of the result come from %vec1.
3058  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
3059  %tmp23.1 = zext i16 %tmp23.0 to i32
3060  %byte23 = shl i32 %tmp23.1, 16
3061  %res = or i32 %byte01, %byte23
3062  store i32 %res, ptr addrspace(1) %out0, align 4
3063  ret void
3064}
3065
; 16-bit funnel-shift-left intrinsic; used by the @fshli16_* tests below.
3066declare i16 @llvm.fshl.i16(i16, i16, i16)
3067
; fshli16_1347: two i16 funnel shifts left by 8, one per source vector, are
; packed into an i32: fshl(%v2e0, %v2e1, 8) in the low half and
; fshl(%v1e0, %v1e1, 8) in the high half. The checks expect a single
; v_perm_b32 with selector 0x30407, the same selector checked for @fshri16_8.
; NOTE(review): the sibling tests (@fshli16_16, @fshli16_24) are named after
; the shift amount; this one shifts by 8 and the checked selector is 0x30407,
; so the "_1347" suffix looks inherited from @extract1347_v2i16 - confirm
; whether the name is intentional.
3068define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
3069; GFX10-LABEL: fshli16_1347:
3070; GFX10:       ; %bb.0:
3071; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3072; GFX10-NEXT:    global_load_dword v6, v[0:1], off
3073; GFX10-NEXT:    global_load_dword v7, v[2:3], off
3074; GFX10-NEXT:    s_waitcnt vmcnt(0)
3075; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
3076; GFX10-NEXT:    global_store_dword v[4:5], v0, off
3077; GFX10-NEXT:    s_setpc_b64 s[30:31]
3078;
3079; GFX9-LABEL: fshli16_1347:
3080; GFX9:       ; %bb.0:
3081; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3082; GFX9-NEXT:    global_load_dword v6, v[0:1], off
3083; GFX9-NEXT:    global_load_dword v7, v[2:3], off
3084; GFX9-NEXT:    s_mov_b32 s4, 0x30407
3085; GFX9-NEXT:    s_waitcnt vmcnt(0)
3086; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
3087; GFX9-NEXT:    global_store_dword v[4:5], v0, off
3088; GFX9-NEXT:    s_waitcnt vmcnt(0)
3089; GFX9-NEXT:    s_setpc_b64 s[30:31]
3090  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
3091  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
3092  %v1e0 = extractelement <2 x i16> %vec1, i64 0
3093  %v1e1 = extractelement <2 x i16> %vec1, i64 1
3094  %v2e0 = extractelement <2 x i16> %vec2, i64 0
3095  %v2e1 = extractelement <2 x i16> %vec2, i64 1
3096
  ; Low 16 bits of the result come from %vec2.
3097  %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
3098  %byte01 = zext i16 %tmp01.0 to i32
3099
  ; High 16 bits of the result come from %vec1.
3100  %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
3101  %tmp23.1 = zext i16 %tmp23.0 to i32
3102  %byte23 = shl i32 %tmp23.1, 16
3103  %res = or i32 %byte01, %byte23
3104  store i32 %res, ptr addrspace(1) %out0, align 4
3105  ret void
3106}
3107
; Funnel-shift-left by 16 on i16 operands. The LangRef takes the shift amount
; modulo the operand width, so this amounts to a shift by 0. The two results
; are packed into one i32 (%in1 result low, %in0 result high); the expected
; selector for the single v_perm_b32 is 0x1000504.
define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <2 x i16>, ptr addrspace(1) %in0, align 4
  %b = load <2 x i16>, ptr addrspace(1) %in1, align 4
  %a.e0 = extractelement <2 x i16> %a, i64 0
  %a.e1 = extractelement <2 x i16> %a, i64 1
  %b.e0 = extractelement <2 x i16> %b, i64 0
  %b.e1 = extractelement <2 x i16> %b, i64 1

  ; Low 16 bits of the result come from the %in1 vector.
  %b.fsh = call i16 @llvm.fshl.i16(i16 %b.e0, i16 %b.e1, i16 16)
  %lo16 = zext i16 %b.fsh to i32

  ; High 16 bits of the result come from the %in0 vector.
  %a.fsh = call i16 @llvm.fshl.i16(i16 %a.e0, i16 %a.e1, i16 16)
  %a.fsh.z = zext i16 %a.fsh to i32
  %hi16 = shl i32 %a.fsh.z, 16
  %packed = or i32 %lo16, %hi16
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3147
; Funnel-shift-left by 24 on i16 operands. The LangRef takes the shift amount
; modulo the operand width, so 24 acts as 8. The two results are packed into
; one i32 (%in1 result low, %in0 result high); the expected selector for the
; single v_perm_b32 is 0x30407.
define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_24:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_24:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x30407
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <2 x i16>, ptr addrspace(1) %in0, align 4
  %b = load <2 x i16>, ptr addrspace(1) %in1, align 4
  %a.e0 = extractelement <2 x i16> %a, i64 0
  %a.e1 = extractelement <2 x i16> %a, i64 1
  %b.e0 = extractelement <2 x i16> %b, i64 0
  %b.e1 = extractelement <2 x i16> %b, i64 1

  ; Low 16 bits of the result come from the %in1 vector.
  %b.fsh = call i16 @llvm.fshl.i16(i16 %b.e0, i16 %b.e1, i16 24)
  %lo16 = zext i16 %b.fsh to i32

  ; High 16 bits of the result come from the %in0 vector.
  %a.fsh = call i16 @llvm.fshl.i16(i16 %a.e0, i16 %a.e1, i16 24)
  %a.fsh.z = zext i16 %a.fsh to i32
  %hi16 = shl i32 %a.fsh.z, 16
  %packed = or i32 %lo16, %hi16
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3187
; Funnel-shift-left by 32 on i16 operands. The LangRef takes the shift amount
; modulo the operand width, so 32 acts as 0. The two results are packed into
; one i32 (%in1 result low, %in0 result high); the expected selector for the
; single v_perm_b32 is 0x1000504.
define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <2 x i16>, ptr addrspace(1) %in0, align 4
  %b = load <2 x i16>, ptr addrspace(1) %in1, align 4
  %a.e0 = extractelement <2 x i16> %a, i64 0
  %a.e1 = extractelement <2 x i16> %a, i64 1
  %b.e0 = extractelement <2 x i16> %b, i64 0
  %b.e1 = extractelement <2 x i16> %b, i64 1

  ; Low 16 bits of the result come from the %in1 vector.
  %b.fsh = call i16 @llvm.fshl.i16(i16 %b.e0, i16 %b.e1, i16 32)
  %lo16 = zext i16 %b.fsh to i32

  ; High 16 bits of the result come from the %in0 vector.
  %a.fsh = call i16 @llvm.fshl.i16(i16 %a.e0, i16 %a.e1, i16 32)
  %a.fsh.z = zext i16 %a.fsh to i32
  %hi16 = shl i32 %a.fsh.z, 16
  %packed = or i32 %lo16, %hi16
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3227
; Funnel-shift-left by 88 on i16 operands. The LangRef takes the shift amount
; modulo the operand width, so 88 acts as 8. The two results are packed into
; one i32 (%in1 result low, %in0 result high); the expected selector for the
; single v_perm_b32 is 0x30407.
define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_88:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[0:1], off
; GFX10-NEXT:    global_load_dword v7, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_88:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[0:1], off
; GFX9-NEXT:    global_load_dword v7, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s4, 0x30407
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <2 x i16>, ptr addrspace(1) %in0, align 4
  %b = load <2 x i16>, ptr addrspace(1) %in1, align 4
  %a.e0 = extractelement <2 x i16> %a, i64 0
  %a.e1 = extractelement <2 x i16> %a, i64 1
  %b.e0 = extractelement <2 x i16> %b, i64 0
  %b.e1 = extractelement <2 x i16> %b, i64 1

  ; Low 16 bits of the result come from the %in1 vector.
  %b.fsh = call i16 @llvm.fshl.i16(i16 %b.e0, i16 %b.e1, i16 88)
  %lo16 = zext i16 %b.fsh to i32

  ; High 16 bits of the result come from the %in0 vector.
  %a.fsh = call i16 @llvm.fshl.i16(i16 %a.e0, i16 %a.e1, i16 88)
  %a.fsh.z = zext i16 %a.fsh to i32
  %hi16 = shl i32 %a.fsh.z, 16
  %packed = or i32 %lo16, %hi16
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3267
; Byte packing where every shift amount is the runtime value %base plus a
; constant (8, 16, 24). The shift amounts are not compile-time constants, and
; the expected output keeps the explicit shift / or sequence rather than a
; v_perm_b32.
define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
; GFX10-LABEL: shlbase:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v7, v[0:1], off
; GFX10-NEXT:    global_load_dword v8, v[2:3], off
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 16, v6
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 24, v6
; GFX10-NEXT:    v_add_nc_u32_e32 v3, 8, v6
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v7
; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-NEXT:    v_lshl_or_b32 v2, v2, v3, v2
; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: shlbase:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v7, v[0:1], off
; GFX9-NEXT:    global_load_dword v8, v[2:3], off
; GFX9-NEXT:    v_add_u32_e32 v0, 8, v6
; GFX9-NEXT:    v_add_u32_e32 v1, 16, v6
; GFX9-NEXT:    v_add_u32_e32 v2, 24, v6
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v7
; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, v0, v3
; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %src1 = load <4 x i8>, ptr addrspace(1) %in1, align 4

  ; Element 0 of %src0, shifted by %base + 8.
  %s0.e0 = extractelement <4 x i8> %src0, i64 0
  %s0.e0.z = zext i8 %s0.e0 to i32
  %amt8 = add i32 %base, 8
  %part1 = shl i32 %s0.e0.z, %amt8

  ; Element 3 of %src0, shifted by %base + 16.
  %s0.e3 = extractelement <4 x i8> %src0, i64 3
  %s0.e3.z = zext i8 %s0.e3 to i32
  %amt16 = add i32 %base, 16
  %part2 = shl i32 %s0.e3.z, %amt16

  ; Element 3 of %src1, shifted by %base + 24.
  %s1.e3 = extractelement <4 x i8> %src1, i64 3
  %s1.e3.z = zext i8 %s1.e3 to i32
  %amt24 = add i32 %base, 24
  %part3 = shl i32 %s1.e3.z, %amt24

  ; The unshifted element 0 is OR-ed in as well.
  %acc0 = or i32 %s0.e0.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3327
; TODO: lower this pattern into v_perm.
; The bytes are picked with runtime indices (%base and %base + 3), and the
; expected output currently uses v_bfe_u32 plus shifts and ors.
define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) {
; GFX10-LABEL: extractbase:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v7, v[0:1], off
; GFX10-NEXT:    global_load_dword v8, v[2:3], off
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v6
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 24, v0
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_bfe_u32 v2, v7, v1, 8
; GFX10-NEXT:    v_bfe_u32 v0, v7, v0, 8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extractbase:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v7, v[0:1], off
; GFX9-NEXT:    global_load_dword v8, v[2:3], off
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v6
; GFX9-NEXT:    v_add_u32_e32 v1, 24, v0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_u32 v0, v7, v0, 8
; GFX9-NEXT:    v_bfe_u32 v2, v7, v1, 8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
  %src1 = load <4 x i8>, ptr addrspace(1) %in1, align 4

  ; Element %base of %src0 fills result bytes 0 and 1.
  %s0.at = extractelement <4 x i8> %src0, i64 %base
  %s0.at.z = zext i8 %s0.at to i32
  %part1 = shl i32 %s0.at.z, 8

  ; Element %base + 3 of %src0 fills byte 2; the same index in %src1 fills
  ; byte 3.
  %idx3 = add i64 %base, 3
  %s0.at3 = extractelement <4 x i8> %src0, i64 %idx3
  %s0.at3.z = zext i8 %s0.at3 to i32
  %part2 = shl i32 %s0.at3.z, 16
  %s1.at3 = extractelement <4 x i8> %src1, i64 %idx3
  %s1.at3.z = zext i8 %s1.at3 to i32
  %part3 = shl i32 %s1.at3.z, 24

  %acc0 = or i32 %s0.at.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3386
; Packs bytes taken from the upper dword of the %in0 vector (elements 5 and 6)
; and the lower dword of the %in1 vector (element 3). The expected code loads
; only those two dwords and combines them with one v_perm_b32
; (selector 0x3060505).
define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_hilo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[2:3], off
; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3060505
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_hilo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[2:3], off
; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4
; GFX9-NEXT:    s_mov_b32 s4, 0x3060505
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src0 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %src1 = load <8 x i8>, ptr addrspace(1) %in1, align 4

  ; Element 5 of %src0 fills result bytes 0 and 1.
  %s0.e5 = extractelement <8 x i8> %src0, i64 5
  %s0.e5.z = zext i8 %s0.e5 to i32
  %part1 = shl i32 %s0.e5.z, 8

  ; Element 6 of %src0 fills byte 2; element 3 of %src1 fills byte 3.
  %s0.e6 = extractelement <8 x i8> %src0, i64 6
  %s0.e6.z = zext i8 %s0.e6 to i32
  %part2 = shl i32 %s0.e6.z, 16
  %s1.e3 = extractelement <8 x i8> %src1, i64 3
  %s1.e3.z = zext i8 %s1.e3 to i32
  %part3 = shl i32 %s1.e3.z, 24

  %acc0 = or i32 %s0.e5.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3428
; Packs bytes taken from the lower dword of the %in0 vector (elements 0 and 3)
; and the upper dword of the %in1 vector (element 4). The expected code loads
; only those two dwords and combines them with one v_perm_b32
; (selector 0x70404).
define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_lohi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v7, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x70404
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_lohi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v7, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x70404
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src0 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %src1 = load <8 x i8>, ptr addrspace(1) %in1, align 4

  ; Element 0 of %src0 fills result bytes 0 and 1.
  %s0.e0 = extractelement <8 x i8> %src0, i64 0
  %s0.e0.z = zext i8 %s0.e0 to i32
  %part1 = shl i32 %s0.e0.z, 8

  ; Element 3 of %src0 fills byte 2; element 4 of %src1 fills byte 3.
  %s0.e3 = extractelement <8 x i8> %src0, i64 3
  %s0.e3.z = zext i8 %s0.e3 to i32
  %part2 = shl i32 %s0.e3.z, 16
  %s1.e4 = extractelement <8 x i8> %src1, i64 4
  %s1.e4.z = zext i8 %s1.e4 to i32
  %part3 = shl i32 %s1.e4.z, 24

  %acc0 = or i32 %s0.e0.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3470
; Packs bytes taken from the upper dword of both vectors: elements 5 and 7 of
; the %in0 vector and element 6 of the %in1 vector. The expected code loads
; each upper dword (offset:4) and combines them with one v_perm_b32
; (selector 0x2070505).
define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_hihi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x2070505
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_hihi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4
; GFX9-NEXT:    s_mov_b32 s4, 0x2070505
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src0 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %src1 = load <8 x i8>, ptr addrspace(1) %in1, align 4

  ; Element 5 of %src0 fills result bytes 0 and 1.
  %s0.e5 = extractelement <8 x i8> %src0, i64 5
  %s0.e5.z = zext i8 %s0.e5 to i32
  %part1 = shl i32 %s0.e5.z, 8

  ; Element 7 of %src0 fills byte 2; element 6 of %src1 fills byte 3.
  %s0.e7 = extractelement <8 x i8> %src0, i64 7
  %s0.e7.z = zext i8 %s0.e7 to i32
  %part2 = shl i32 %s0.e7.z, 16
  %s1.e6 = extractelement <8 x i8> %src1, i64 6
  %s1.e6.z = zext i8 %s1.e6 to i32
  %part3 = shl i32 %s1.e6.z, 24

  %acc0 = or i32 %s0.e5.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3512
; All four result bytes come from a single <8 x i8> load: element 4 (bytes 0
; and 1), element 7 (byte 2) and element 1 (byte 3). The expected code is one
; dwordx2 load followed by one v_perm_b32 across its two halves
; (selector 0x1070404).
define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_v8i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x1070404
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v8i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, 0x1070404
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src = load <8 x i8>, ptr addrspace(1) %in0, align 4

  ; Element 4 fills result bytes 0 and 1.
  %e4 = extractelement <8 x i8> %src, i64 4
  %e4.z = zext i8 %e4 to i32
  %part1 = shl i32 %e4.z, 8

  ; Element 7 fills byte 2; element 1 fills byte 3.
  %e7 = extractelement <8 x i8> %src, i64 7
  %e7.z = zext i8 %e7 to i32
  %part2 = shl i32 %e7.z, 16
  %e1 = extractelement <8 x i8> %src, i64 1
  %e1.z = zext i8 %e1 to i32
  %part3 = shl i32 %e1.z, 24

  %acc0 = or i32 %e4.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3551
; All four result bytes come from the last dword of a <256 x i8> load:
; element 255 (bytes 0 and 1), element 253 (byte 2) and element 254 (byte 3).
; The expected code loads only that dword (offset:252) and permutes it with
; itself (selector 0x6050707).
define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_v256i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:252
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6050707
; GFX10-NEXT:    global_store_dword v[2:3], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v256i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:252
; GFX9-NEXT:    s_mov_b32 s4, 0x6050707
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT:    global_store_dword v[2:3], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src = load <256 x i8>, ptr addrspace(1) %in0, align 4

  ; Element 255 fills result bytes 0 and 1.
  %e255 = extractelement <256 x i8> %src, i64 255
  %e255.z = zext i8 %e255 to i32
  %part1 = shl i32 %e255.z, 8

  ; Element 253 fills byte 2; element 254 fills byte 3.
  %e253 = extractelement <256 x i8> %src, i64 253
  %e253.z = zext i8 %e253 to i32
  %part2 = shl i32 %e253.z, 16
  %e254 = extractelement <256 x i8> %src, i64 254
  %e254.z = zext i8 %e254 to i32
  %part3 = shl i32 %e254.z, 24

  %acc0 = or i32 %e255.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3590
; TODO: support this pattern.
; The result bytes come from three different dwords: both halves of the %in0
; vector (elements 0 and 5) and the upper half of the %in1 vector (element 6).
; The expected output currently uses shifts, masks and ors rather than
; v_perm_b32.
define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_3src:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    global_load_dword v8, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v8
; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v6
; GFX10-NEXT:    v_and_b32_e32 v0, 0xff0000, v0
; GFX10-NEXT:    v_and_b32_e32 v1, 0xff000000, v1
; GFX10-NEXT:    v_lshl_or_b32 v2, v2, 8, v2
; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_3src:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT:    global_load_dword v8, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v6
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
; GFX9-NEXT:    v_and_b32_e32 v1, 0xff0000, v1
; GFX9-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src0 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %src1 = load <8 x i8>, ptr addrspace(1) %in1, align 4

  ; Element 0 of %src0 fills result bytes 0 and 1.
  %s0.e0 = extractelement <8 x i8> %src0, i64 0
  %s0.e0.z = zext i8 %s0.e0 to i32
  %part1 = shl i32 %s0.e0.z, 8

  ; Element 5 of %src0 fills byte 2; element 6 of %src1 fills byte 3.
  %s0.e5 = extractelement <8 x i8> %src0, i64 5
  %s0.e5.z = zext i8 %s0.e5 to i32
  %part2 = shl i32 %s0.e5.z, 16
  %s1.e6 = extractelement <8 x i8> %src1, i64 6
  %s1.e6.z = zext i8 %s1.e6 to i32
  %part3 = shl i32 %s1.e6.z, 24

  %acc0 = or i32 %s0.e0.z, %part1
  %acc1 = or i32 %acc0, %part2
  %packed = or i32 %acc1, %part3
  store i32 %packed, ptr addrspace(1) %out0, align 4
  ret void
}
3646
; Crash regression test: extracting from a <6 x i16> vector must compile
; without crashing. Elements 0..3 are packed pairwise into two i32 values;
; %in1 is unused.
define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v6i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:6
; GFX10-NEXT:    global_load_ushort v3, v[0:1], off
; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:2
; GFX10-NEXT:    global_load_ushort v9, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshl_or_b32 v0, v8, 16, v3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v9
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    global_store_dword v[6:7], v1, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v6i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v2, v[0:1], off offset:6
; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:4
; GFX9-NEXT:    global_load_ushort v9, v[0:1], off offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v1, v9, 16, v3
; GFX9-NEXT:    global_store_dword v[4:5], v1, off
; GFX9-NEXT:    global_store_dword v[6:7], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src = load <6 x i16>, ptr addrspace(1) %in0, align 2
  %e0 = extractelement <6 x i16> %src, i32 0
  %e1 = extractelement <6 x i16> %src, i32 1
  %e2 = extractelement <6 x i16> %src, i32 2
  %e3 = extractelement <6 x i16> %src, i32 3

  ; First word: element 1 in the high half, element 0 in the low half.
  %e0.z = zext i16 %e0 to i32
  %e1.z = zext i16 %e1 to i32
  %e1.hi = shl nuw i32 %e1.z, 16
  %word0 = or i32 %e1.hi, %e0.z

  ; Second word: element 3 in the high half, element 2 in the low half.
  %e2.z = zext i16 %e2 to i32
  %e3.z = zext i16 %e3 to i32
  %e3.hi = shl nuw i32 %e3.z, 16
  %word1 = or i32 %e2.z, %e3.hi

  store i32 %word0, ptr addrspace(1) %out0, align 4
  store i32 %word1, ptr addrspace(1) %out1, align 4
  ret void
}
3698
3699
; Crash regression test: extracting from a <7 x i16> vector must compile
; without crashing. Elements 0..3 are packed pairwise into two i32 values;
; the expected code is a single dwordx2 load feeding both stores directly.
; %in1 is unused.
define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v7i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    global_store_dword v[6:7], v1, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v7i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    global_store_dword v[6:7], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src = load <7 x i16>, ptr addrspace(1) %in0, align 2
  %e0 = extractelement <7 x i16> %src, i32 0
  %e1 = extractelement <7 x i16> %src, i32 1
  %e2 = extractelement <7 x i16> %src, i32 2
  %e3 = extractelement <7 x i16> %src, i32 3

  ; First word: element 1 in the high half, element 0 in the low half.
  %e0.z = zext i16 %e0 to i32
  %e1.z = zext i16 %e1 to i32
  %e1.hi = shl nuw i32 %e1.z, 16
  %word0 = or i32 %e1.hi, %e0.z

  ; Second word: element 3 in the high half, element 2 in the low half.
  %e2.z = zext i16 %e2 to i32
  %e3.z = zext i16 %e3 to i32
  %e3.hi = shl nuw i32 %e3.z, 16
  %word1 = or i32 %e2.z, %e3.hi

  store i32 %word0, ptr addrspace(1) %out0, align 4
  store i32 %word1, ptr addrspace(1) %out1, align 4
  ret void
}
3738
; Crash regression test: extracting from a <13 x i8> vector must compile
; without crashing. Each output word holds one byte at bits 0..7 and another
; at bits 16..23. %in1 is unused.
define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v13i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:8
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_bfe_u32 v0, v2, 8, 8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v8
; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040c00
; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040c03
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    global_store_dword v[6:7], v1, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v13i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
; GFX9-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX9-NEXT:    s_mov_b32 s5, 0x5040c03
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_u32 v0, v2, 8, 8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v8
; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s5
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    global_store_dword v[6:7], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src = load <13 x i8>, ptr addrspace(1) %in0, align 2
  %e0 = extractelement <13 x i8> %src, i32 0
  %e1 = extractelement <13 x i8> %src, i32 1
  %e7 = extractelement <13 x i8> %src, i32 7
  %e8 = extractelement <13 x i8> %src, i32 8

  ; First word: element 0 at bits 0..7, element 1 at bits 16..23.
  %e0.z = zext i8 %e0 to i32
  %e1.z = zext i8 %e1 to i32
  %e1.hi = shl nuw i32 %e1.z, 16
  %word0 = or i32 %e1.hi, %e0.z

  ; Second word: element 7 at bits 0..7, element 8 at bits 16..23.
  %e7.z = zext i8 %e7 to i32
  %e8.z = zext i8 %e8 to i32
  %e8.hi = shl nuw i32 %e8.z, 16
  %word1 = or i32 %e7.z, %e8.hi

  store i32 %word0, ptr addrspace(1) %out0, align 4
  store i32 %word1, ptr addrspace(1) %out1, align 4
  ret void
}
3792
; Crash regression test: extracting from a <13 x i64> vector must compile
; without crashing. 16-bit pieces of elements 0, 1, 7 and 8 are packed into
; two i32 values; each word is expected to come from one v_perm_b32 with
; selector 0x1000504. %in1 is unused.
define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v13i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
; GFX10-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off
; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:64
; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_perm_b32 v0, v12, v13, 0x1000504
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v1, v10, v14, 0x1000504
; GFX10-NEXT:    global_store_dword v[4:5], v0, off
; GFX10-NEXT:    global_store_dword v[6:7], v1, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v13i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
; GFX9-NEXT:    global_load_dwordx4 v[11:14], v[0:1], off
; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:64
; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_perm_b32 v0, v12, v13, s4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v1, v10, v14, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    global_store_dword v[6:7], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %src = load <13 x i64>, ptr addrspace(1) %in0, align 2
  %e0 = extractelement <13 x i64> %src, i32 0
  %e1 = extractelement <13 x i64> %src, i32 1
  %e7 = extractelement <13 x i64> %src, i32 7
  %e8 = extractelement <13 x i64> %src, i32 8

  ; First word: bits 32..47 of element 0 in the low half, the low bits of
  ; element 1 shifted into the high half.
  %e0.up = lshr i64 %e0, 32
  %e0.t16 = trunc i64 %e0.up to i16
  %e0.z = zext i16 %e0.t16 to i32
  %e1.t32 = trunc i64 %e1 to i32
  %e1.hi = shl nuw i32 %e1.t32, 16
  %word0 = or i32 %e1.hi, %e0.z

  ; Second word: bits 0..15 of element 7 in the low half, the low bits of
  ; element 8 shifted into the high half.
  %e7.t16 = trunc i64 %e7 to i16
  %e7.z = zext i16 %e7.t16 to i32
  %e8.t32 = trunc i64 %e8 to i32
  %e8.hi = shl nuw i32 %e8.t32, 16
  %word1 = or i32 %e7.z, %e8.hi

  store i32 %word0, ptr addrspace(1) %out0, align 4
  store i32 %word1, ptr addrspace(1) %out1, align 4
  ret void
}
3848
3849
; The low 16 bits of each i32 element of the loaded <2 x i32> should be
; combined into a single i32 (element 0 in the low half, element 1 in the
; high half). %in1 is unused.
define hidden void @trunc_vector(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: trunc_vector:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_ushort v2, v[0:1], off
; GFX10-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dword v[4:5], v2, off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: trunc_vector:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
; GFX9-NEXT:    global_load_ushort v3, v[0:1], off offset:4
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX9-NEXT:    global_store_dword v[4:5], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %wide = load <2 x i32>, ptr addrspace(1) %in0, align 2
  %narrow = trunc <2 x i32> %wide to <2 x i16>
  %n0 = extractelement <2 x i16> %narrow, i32 0
  %n1 = extractelement <2 x i16> %narrow, i32 1

  %n0.z = zext i16 %n0 to i32
  %n1.z = zext i16 %n1 to i32
  %n1.hi = shl nuw i32 %n1.z, 16
  %word = or i32 %n1.hi, %n0.z

  store i32 %word, ptr addrspace(1) %out0, align 4
  ret void
}
3885