xref: /llvm-project/llvm/test/CodeGen/AMDGPU/permute.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
3
; (load << 8) | (%arg1 & 0xff): low byte from the scalar, upper three bytes from
; the shifted VGPR — selected to a single v_perm_b32 with byte-select 0x6050400.
4define amdgpu_kernel void @lsh8_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
5; GCN-LABEL: lsh8_or_and:
6; GCN:       ; %bb.0: ; %bb
7; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
8; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
9; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
10; GCN-NEXT:    v_mov_b32_e32 v3, 0x6050400
11; GCN-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN-NEXT:    v_mov_b32_e32 v1, s1
13; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
14; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
15; GCN-NEXT:    flat_load_dword v2, v[0:1]
16; GCN-NEXT:    s_waitcnt vmcnt(0)
17; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
18; GCN-NEXT:    flat_store_dword v[0:1], v2
19; GCN-NEXT:    s_endpgm
20bb:
21  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
22  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
23  %tmp = load i32, ptr addrspace(1) %gep, align 4
24  %tmp2 = shl i32 %tmp, 8
25  %tmp3 = and i32 %arg1, 255
26  %tmp4 = or i32 %tmp2, %tmp3
27  store i32 %tmp4, ptr addrspace(1) %gep, align 4
28  ret void
29}
30
; (load >> 24) | (%arg1 & 0xffffff00): selected to v_perm_b32 with mask
; 0x7060503 — note the swapped operand order (s2 first) relative to lsh8_or_and.
31define amdgpu_kernel void @lsr24_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
32; GCN-LABEL: lsr24_or_and:
33; GCN:       ; %bb.0: ; %bb
34; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
35; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
36; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
37; GCN-NEXT:    v_mov_b32_e32 v3, 0x7060503
38; GCN-NEXT:    s_waitcnt lgkmcnt(0)
39; GCN-NEXT:    v_mov_b32_e32 v1, s1
40; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
41; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
42; GCN-NEXT:    flat_load_dword v2, v[0:1]
43; GCN-NEXT:    s_waitcnt vmcnt(0)
44; GCN-NEXT:    v_perm_b32 v2, s2, v2, v3
45; GCN-NEXT:    flat_store_dword v[0:1], v2
46; GCN-NEXT:    s_endpgm
47bb:
48  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
49  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
50  %tmp = load i32, ptr addrspace(1) %gep, align 4
51  %tmp2 = lshr i32 %tmp, 24
52  %tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
53  %tmp4 = or i32 %tmp2, %tmp3
54  store i32 %tmp4, ptr addrspace(1) %gep, align 4
55  ret void
56}
57
; (load & 0xffffff00) | (%arg1 >> 24), then xor with the sign bit: the byte
; merge still becomes v_perm_b32 (mask 0x7060503); the xor remains a separate
; v_xor_b32_e32 and must not be absorbed into the permute.
58define amdgpu_kernel void @and_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
59; GCN-LABEL: and_or_lsr24:
60; GCN:       ; %bb.0: ; %bb
61; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
62; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
63; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
64; GCN-NEXT:    v_mov_b32_e32 v3, 0x7060503
65; GCN-NEXT:    s_waitcnt lgkmcnt(0)
66; GCN-NEXT:    v_mov_b32_e32 v1, s1
67; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
68; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
69; GCN-NEXT:    flat_load_dword v2, v[0:1]
70; GCN-NEXT:    s_waitcnt vmcnt(0)
71; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
72; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
73; GCN-NEXT:    flat_store_dword v[0:1], v2
74; GCN-NEXT:    s_endpgm
75bb:
76  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
77  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
78  %tmp = load i32, ptr addrspace(1) %gep, align 4
79  %tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
80  %tmp3 = lshr i32 %arg1, 24
81  %tmp4 = or i32 %tmp2, %tmp3
82  %tmp5 = xor i32 %tmp4, -2147483648
83  store i32 %tmp5, ptr addrspace(1) %gep, align 4
84  ret void
85}
86
; (load & 0xff00ff00) | (%arg1 & 0x00ff00ff): interleaved byte-wise merge of
; two values with disjoint masks, selected to v_perm_b32 with mask 0x7020500.
87define amdgpu_kernel void @and_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
88; GCN-LABEL: and_or_and:
89; GCN:       ; %bb.0: ; %bb
90; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
91; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
92; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
93; GCN-NEXT:    v_mov_b32_e32 v3, 0x7020500
94; GCN-NEXT:    s_waitcnt lgkmcnt(0)
95; GCN-NEXT:    v_mov_b32_e32 v1, s1
96; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
97; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
98; GCN-NEXT:    flat_load_dword v2, v[0:1]
99; GCN-NEXT:    s_waitcnt vmcnt(0)
100; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
101; GCN-NEXT:    flat_store_dword v[0:1], v2
102; GCN-NEXT:    s_endpgm
103bb:
104  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
105  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
106  %tmp = load i32, ptr addrspace(1) %gep, align 4
107  %tmp2 = and i32 %tmp, -16711936
108  %tmp3 = and i32 %arg1, 16711935
109  %tmp4 = or i32 %tmp2, %tmp3
110  store i32 %tmp4, ptr addrspace(1) %gep, align 4
111  ret void
112}
113
; (load << 8) | (%arg1 >> 24) — a 32-bit byte-rotate concatenation; currently
; lowered as v_perm_b32 with mask 0x2010007 rather than the alignbit below.
114; FIXME: produce v_alignbit_b32 v2, v2, s0, 24 instead of v_perm
115define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
116; GCN-LABEL: lsh8_or_lsr24:
117; GCN:       ; %bb.0: ; %bb
118; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
119; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
120; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
121; GCN-NEXT:    v_mov_b32_e32 v3, 0x2010007
122; GCN-NEXT:    s_waitcnt lgkmcnt(0)
123; GCN-NEXT:    v_mov_b32_e32 v1, s1
124; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
125; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
126; GCN-NEXT:    flat_load_dword v2, v[0:1]
127; GCN-NEXT:    s_waitcnt vmcnt(0)
128; GCN-NEXT:    v_perm_b32 v2, s2, v2, v3
129; GCN-NEXT:    flat_store_dword v[0:1], v2
130; GCN-NEXT:    s_endpgm
131bb:
132  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
133  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
134  %tmp = load i32, ptr addrspace(1) %gep, align 4
135  %tmp2 = shl i32 %tmp, 8
136  %tmp3 = lshr i32 %arg1, 24
137  %tmp4 = or i32 %tmp2, %tmp3
138  store i32 %tmp4, ptr addrspace(1) %gep, align 4
139  ret void
140}
141
; (load << 16) | (%arg1 >> 24): selected to v_perm_b32 with mask 0x5040c03,
; where the 0x0c select presumably encodes a constant 0x00 byte (the hole the
; shifts leave in byte 1) — see the ISA v_perm_b32 select encoding.
142define amdgpu_kernel void @lsh16_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %arg1) {
143; GCN-LABEL: lsh16_or_lsr24:
144; GCN:       ; %bb.0: ; %bb
145; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
146; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
147; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
148; GCN-NEXT:    v_mov_b32_e32 v3, 0x5040c03
149; GCN-NEXT:    s_waitcnt lgkmcnt(0)
150; GCN-NEXT:    v_mov_b32_e32 v1, s1
151; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
152; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
153; GCN-NEXT:    flat_load_dword v2, v[0:1]
154; GCN-NEXT:    s_waitcnt vmcnt(0)
155; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
156; GCN-NEXT:    flat_store_dword v[0:1], v2
157; GCN-NEXT:    s_endpgm
158bb:
159  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
160  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
161  %tmp = load i32, ptr addrspace(1) %gep, align 4
162  %tmp2 = shl i32 %tmp, 16
163  %tmp3 = lshr i32 %arg1, 24
164  %tmp4 = or i32 %tmp2, %tmp3
165  store i32 %tmp4, ptr addrspace(1) %gep, align 4
166  ret void
167}
168
; xor of two values masked with disjoint constants (0xff0000ff and 0x00ffff00)
; behaves like or, so it is still recognized as a byte merge and selected to
; v_perm_b32 with mask 0x7020104.
169define amdgpu_kernel void @and_xor_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
170; GCN-LABEL: and_xor_and:
171; GCN:       ; %bb.0: ; %bb
172; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
173; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
174; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
175; GCN-NEXT:    v_mov_b32_e32 v3, 0x7020104
176; GCN-NEXT:    s_waitcnt lgkmcnt(0)
177; GCN-NEXT:    v_mov_b32_e32 v1, s1
178; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
179; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
180; GCN-NEXT:    flat_load_dword v2, v[0:1]
181; GCN-NEXT:    s_waitcnt vmcnt(0)
182; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
183; GCN-NEXT:    flat_store_dword v[0:1], v2
184; GCN-NEXT:    s_endpgm
185bb:
186  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
187  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
188  %tmp = load i32, ptr addrspace(1) %gep, align 4
189  %tmp2 = and i32 %tmp, -16776961
190  %tmp3 = and i32 %arg1, 16776960
191  %tmp4 = xor i32 %tmp2, %tmp3
192  store i32 %tmp4, ptr addrspace(1) %gep, align 4
193  ret void
194}
195
; Byte merge where one operand has extra known bits (or with 0xffff0000): the
; checks document the current lowering as scalar s_and/s_or plus v_and/v_or
; rather than the single v_perm_b32 the FIXME asks for.
196; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
197define amdgpu_kernel void @and_or_or_and(ptr addrspace(1) nocapture %arg, i32 %arg1) {
198; GCN-LABEL: and_or_or_and:
199; GCN:       ; %bb.0: ; %bb
200; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
201; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
202; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
203; GCN-NEXT:    s_waitcnt lgkmcnt(0)
204; GCN-NEXT:    v_mov_b32_e32 v1, s1
205; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
206; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
207; GCN-NEXT:    flat_load_dword v2, v[0:1]
208; GCN-NEXT:    s_and_b32 s0, s2, 0xff00
209; GCN-NEXT:    s_or_b32 s0, s0, 0xffff0000
210; GCN-NEXT:    s_waitcnt vmcnt(0)
211; GCN-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
212; GCN-NEXT:    v_or_b32_e32 v2, s0, v2
213; GCN-NEXT:    flat_store_dword v[0:1], v2
214; GCN-NEXT:    s_endpgm
215bb:
216  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
217  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
218  %tmp = load i32, ptr addrspace(1) %gep, align 4
219  %and = and i32 %tmp, 16711935     ; 0x00ff00ff
220  %tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00
221  %tmp2 = or i32 %tmp1, -65536
222  %tmp3 = or i32 %tmp2, %and
223  store i32 %tmp3, ptr addrspace(1) %gep, align 4
224  ret void
225}
226
; ((load << 16) | (%arg1 & 0xffff)) & 0xff0000ff: collapses to a single
; v_perm_b32 with mask 0x50c0c00 — the 0x0c selects presumably yield the
; constant 0x00 middle bytes that the final and clears.
227define amdgpu_kernel void @and_or_and_shl(ptr addrspace(1) nocapture %arg, i32 %arg1) {
228; GCN-LABEL: and_or_and_shl:
229; GCN:       ; %bb.0: ; %bb
230; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
231; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
232; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
233; GCN-NEXT:    v_mov_b32_e32 v3, 0x50c0c00
234; GCN-NEXT:    s_waitcnt lgkmcnt(0)
235; GCN-NEXT:    v_mov_b32_e32 v1, s1
236; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
237; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
238; GCN-NEXT:    flat_load_dword v2, v[0:1]
239; GCN-NEXT:    s_waitcnt vmcnt(0)
240; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
241; GCN-NEXT:    flat_store_dword v[0:1], v2
242; GCN-NEXT:    s_endpgm
243bb:
244  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
245  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
246  %tmp = load i32, ptr addrspace(1) %gep, align 4
247  %tmp2 = shl i32 %tmp, 16
248  %tmp3 = and i32 %arg1, 65535
249  %tmp4 = or i32 %tmp2, %tmp3
250  %and = and i32 %tmp4, 4278190335
251  store i32 %and, ptr addrspace(1) %gep, align 4
252  ret void
253}
254
; (load | 0x00ffff00) & (%arg1 | 0xff0000ff): an and of ors whose constants
; together cover all bytes acts as a byte-wise select, lowered to v_perm_b32
; with mask 0x7020104.
255define amdgpu_kernel void @or_and_or(ptr addrspace(1) nocapture %arg, i32 %arg1) {
256; GCN-LABEL: or_and_or:
257; GCN:       ; %bb.0: ; %bb
258; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
259; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
260; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
261; GCN-NEXT:    v_mov_b32_e32 v3, 0x7020104
262; GCN-NEXT:    s_waitcnt lgkmcnt(0)
263; GCN-NEXT:    v_mov_b32_e32 v1, s1
264; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
265; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
266; GCN-NEXT:    flat_load_dword v2, v[0:1]
267; GCN-NEXT:    s_waitcnt vmcnt(0)
268; GCN-NEXT:    v_perm_b32 v2, v2, s2, v3
269; GCN-NEXT:    flat_store_dword v[0:1], v2
270; GCN-NEXT:    s_endpgm
271bb:
272  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
273  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
274  %tmp = load i32, ptr addrspace(1) %gep, align 4
275  %or1 = or i32 %tmp, 16776960    ; 0x00ffff00
276  %or2 = or i32 %arg1, 4278190335 ; 0xff0000ff
277  %and = and i32 %or1, %or2
278  store i32 %and, ptr addrspace(1) %gep, align 4
279  ret void
280}
281
; Same merge shape as @and_or_or_and but with extra known bits on both inputs
; (%arg1 | 0x8000, load | 4): the second store is constant-folded via
; known-bits to 0xffff8004, while the merge itself is still lowered as
; s_and/s_or + v_and/v_or rather than the v_perm_b32 the FIXME asks for.
282; FIXME here should have been "v_perm_b32" with 0xffff0500 mask.
283define amdgpu_kernel void @known_ffff0500(ptr addrspace(1) nocapture %arg, i32 %arg1) {
284; GCN-LABEL: known_ffff0500:
285; GCN:       ; %bb.0: ; %bb
286; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
287; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
288; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
289; GCN-NEXT:    v_mov_b32_e32 v5, 0xffff8004
290; GCN-NEXT:    s_waitcnt lgkmcnt(0)
291; GCN-NEXT:    v_mov_b32_e32 v1, s1
292; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
293; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
294; GCN-NEXT:    flat_load_dword v4, v[0:1]
295; GCN-NEXT:    s_bitset1_b32 s2, 15
296; GCN-NEXT:    v_mov_b32_e32 v3, s1
297; GCN-NEXT:    v_mov_b32_e32 v2, s0
298; GCN-NEXT:    s_and_b32 s0, s2, 0xff00
299; GCN-NEXT:    s_or_b32 s0, s0, 0xffff0000
300; GCN-NEXT:    s_waitcnt vmcnt(0)
301; GCN-NEXT:    v_or_b32_e32 v4, 4, v4
302; GCN-NEXT:    v_and_b32_e32 v4, 0xff00ff, v4
303; GCN-NEXT:    v_or_b32_e32 v4, s0, v4
304; GCN-NEXT:    flat_store_dword v[0:1], v4
305; GCN-NEXT:    flat_store_dword v[2:3], v5
306; GCN-NEXT:    s_endpgm
307bb:
308  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
309  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
310  %load = load i32, ptr addrspace(1) %gep, align 4
311  %mask1 = or i32 %arg1, 32768 ; 0x8000
312  %mask2 = or i32 %load, 4
313  %and = and i32 %mask2, 16711935     ; 0x00ff00ff
314  %tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00
315  %tmp2 = or i32 %tmp1, 4294901760   ; 0xffff0000
316  %tmp3 = or i32 %tmp2, %and
317  store i32 %tmp3, ptr addrspace(1) %gep, align 4
318  %v = and i32 %tmp3, 4294934532 ; 0xffff8004
319  store i32 %v, ptr addrspace(1) %arg, align 4
320  ret void
321}
322
; Like @and_or_and_shl but with %arg1 | 4: the merge still forms v_perm_b32
; (mask 0x50c0c00, with the or folded into s2 beforehand), and known-bits
; folds the second store down to the constant 4.
323define amdgpu_kernel void @known_050c0c00(ptr addrspace(1) nocapture %arg, i32 %arg1) {
324; GCN-LABEL: known_050c0c00:
325; GCN:       ; %bb.0: ; %bb
326; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
327; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
328; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
329; GCN-NEXT:    v_mov_b32_e32 v5, 0x50c0c00
330; GCN-NEXT:    v_mov_b32_e32 v6, 4
331; GCN-NEXT:    s_waitcnt lgkmcnt(0)
332; GCN-NEXT:    v_mov_b32_e32 v1, s1
333; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
334; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
335; GCN-NEXT:    flat_load_dword v4, v[0:1]
336; GCN-NEXT:    s_or_b32 s2, s2, 4
337; GCN-NEXT:    v_mov_b32_e32 v3, s1
338; GCN-NEXT:    v_mov_b32_e32 v2, s0
339; GCN-NEXT:    s_waitcnt vmcnt(0)
340; GCN-NEXT:    v_perm_b32 v4, v4, s2, v5
341; GCN-NEXT:    flat_store_dword v[0:1], v4
342; GCN-NEXT:    flat_store_dword v[2:3], v6
343; GCN-NEXT:    s_endpgm
344bb:
345  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
346  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
347  %tmp = load i32, ptr addrspace(1) %gep, align 4
348  %tmp2 = shl i32 %tmp, 16
349  %mask = or i32 %arg1, 4
350  %tmp3 = and i32 %mask, 65535
351  %tmp4 = or i32 %tmp2, %tmp3
352  %and = and i32 %tmp4, 4278190335
353  store i32 %and, ptr addrspace(1) %gep, align 4
354  %v = and i32 %and, 16776964
355  store i32 %v, ptr addrspace(1) %arg, align 4
356  ret void
357}
358
; Byte merge with known bits on both sides (load | 0x8000 stays a v_or,
; %arg1 | 4 is folded into s2): here v_perm_b32 IS formed, with mask
; 0xffff0500, and known-bits folds the second store to the constant
; 0xffff8004. Counterpart to the not-yet-optimized @known_ffff0500 above.
359define amdgpu_kernel void @known_ffff8004(ptr addrspace(1) nocapture %arg, i32 %arg1) {
360; GCN-LABEL: known_ffff8004:
361; GCN:       ; %bb.0: ; %bb
362; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
363; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
364; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
365; GCN-NEXT:    v_mov_b32_e32 v5, 0xffff0500
366; GCN-NEXT:    v_mov_b32_e32 v6, 0xffff8004
367; GCN-NEXT:    s_waitcnt lgkmcnt(0)
368; GCN-NEXT:    v_mov_b32_e32 v1, s1
369; GCN-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
370; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
371; GCN-NEXT:    flat_load_dword v4, v[0:1]
372; GCN-NEXT:    s_or_b32 s2, s2, 4
373; GCN-NEXT:    v_mov_b32_e32 v3, s1
374; GCN-NEXT:    v_mov_b32_e32 v2, s0
375; GCN-NEXT:    s_waitcnt vmcnt(0)
376; GCN-NEXT:    v_or_b32_e32 v4, 0x8000, v4
377; GCN-NEXT:    v_perm_b32 v4, v4, s2, v5
378; GCN-NEXT:    flat_store_dword v[0:1], v4
379; GCN-NEXT:    flat_store_dword v[2:3], v6
380; GCN-NEXT:    s_endpgm
381bb:
382  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
383  %gep = getelementptr i32, ptr addrspace(1) %arg, i32 %id
384  %load = load i32, ptr addrspace(1) %gep, align 4
385  %mask1 = or i32 %arg1, 4
386  %mask2 = or i32 %load, 32768 ; 0x8000
387  %and = and i32 %mask1, 16711935     ; 0x00ff00ff
388  %tmp1 = and i32 %mask2, 4294967040 ; 0xffffff00
389  %tmp2 = or i32 %tmp1, 4294901760   ; 0xffff0000
390  %tmp3 = or i32 %tmp2, %and
391  store i32 %tmp3, ptr addrspace(1) %gep, align 4
392  %v = and i32 %tmp3, 4294934532 ; 0xffff8004
393  store i32 %v, ptr addrspace(1) %arg, align 4
394  ret void
395}
396
397declare i32 @llvm.amdgcn.workitem.id.x()
398