xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bfi_int.ll (revision 1416744f8405db03096bc240a8ec9de176a71569)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
5; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
6; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
7
8; BFI_INT Definition pattern from ISA docs
9; (y & x) | (z & ~x)
10;
; Kernel variant of the BFI definition pattern: computes (z & ~x) | (y & x)
; from the kernel arguments and stores the result through %out. All five run
; lines expect the scalar sequence s_andn2_b32 / s_and_b32 / s_or_b32 here,
; not a v_bfi_b32.
11define amdgpu_kernel void @s_bfi_def_i32(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
12; GFX7-LABEL: s_bfi_def_i32:
13; GFX7:       ; %bb.0: ; %entry
14; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
15; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xd
16; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
17; GFX7-NEXT:    s_mov_b32 s3, 0xf000
18; GFX7-NEXT:    s_mov_b32 s2, -1
19; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
20; GFX7-NEXT:    s_andn2_b32 s6, s6, s4
21; GFX7-NEXT:    s_and_b32 s4, s5, s4
22; GFX7-NEXT:    s_or_b32 s4, s6, s4
23; GFX7-NEXT:    v_mov_b32_e32 v0, s4
24; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
25; GFX7-NEXT:    s_endpgm
26;
27; GFX8-LABEL: s_bfi_def_i32:
28; GFX8:       ; %bb.0: ; %entry
29; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
30; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
31; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
32; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX8-NEXT:    s_andn2_b32 s4, s4, s2
34; GFX8-NEXT:    s_and_b32 s2, s3, s2
35; GFX8-NEXT:    s_or_b32 s2, s4, s2
36; GFX8-NEXT:    v_mov_b32_e32 v0, s0
37; GFX8-NEXT:    v_mov_b32_e32 v1, s1
38; GFX8-NEXT:    v_mov_b32_e32 v2, s2
39; GFX8-NEXT:    flat_store_dword v[0:1], v2
40; GFX8-NEXT:    s_endpgm
41;
42; GFX10-LABEL: s_bfi_def_i32:
43; GFX10:       ; %bb.0: ; %entry
44; GFX10-NEXT:    s_clause 0x2
45; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
46; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x34
47; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
48; GFX10-NEXT:    v_mov_b32_e32 v0, 0
49; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX10-NEXT:    s_andn2_b32 s4, s4, s2
51; GFX10-NEXT:    s_and_b32 s2, s3, s2
52; GFX10-NEXT:    s_or_b32 s2, s4, s2
53; GFX10-NEXT:    v_mov_b32_e32 v1, s2
54; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
55; GFX10-NEXT:    s_endpgm
56;
57; GFX8-GISEL-LABEL: s_bfi_def_i32:
58; GFX8-GISEL:       ; %bb.0: ; %entry
59; GFX8-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
60; GFX8-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x34
61; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
62; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX8-GISEL-NEXT:    s_andn2_b32 s4, s4, s2
64; GFX8-GISEL-NEXT:    s_and_b32 s2, s3, s2
65; GFX8-GISEL-NEXT:    s_or_b32 s2, s4, s2
66; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
67; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
68; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
69; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
70; GFX8-GISEL-NEXT:    s_endpgm
71;
72; GFX10-GISEL-LABEL: s_bfi_def_i32:
73; GFX10-GISEL:       ; %bb.0: ; %entry
74; GFX10-GISEL-NEXT:    s_clause 0x2
75; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
76; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x34
77; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
78; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
79; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
80; GFX10-GISEL-NEXT:    s_andn2_b32 s4, s4, s2
81; GFX10-GISEL-NEXT:    s_and_b32 s2, s3, s2
82; GFX10-GISEL-NEXT:    s_or_b32 s2, s4, s2
83; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
84; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
85; GFX10-GISEL-NEXT:    s_endpgm
86entry:
87  %0 = xor i32 %x, -1
88  %1 = and i32 %z, %0
89  %2 = and i32 %y, %x
90  %3 = or i32 %1, %2
91  store i32 %3, i32 addrspace(1)* %out
92  ret void
93}
94
; Same (z & ~x) | (y & x) expression as a callable function that returns the
; value. Every run line expects it to fold into a single
; v_bfi_b32 v0, v0, v1, v2.
95define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
96; GFX7-LABEL: v_bfi_def_i32:
97; GFX7:       ; %bb.0: ; %entry
98; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
100; GFX7-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX8-LABEL: v_bfi_def_i32:
103; GFX8:       ; %bb.0: ; %entry
104; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
106; GFX8-NEXT:    s_setpc_b64 s[30:31]
107;
108; GFX10-LABEL: v_bfi_def_i32:
109; GFX10:       ; %bb.0: ; %entry
110; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
112; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
113; GFX10-NEXT:    s_setpc_b64 s[30:31]
114;
115; GFX8-GISEL-LABEL: v_bfi_def_i32:
116; GFX8-GISEL:       ; %bb.0: ; %entry
117; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
119; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
120;
121; GFX10-GISEL-LABEL: v_bfi_def_i32:
122; GFX10-GISEL:       ; %bb.0: ; %entry
123; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
124; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
125; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
126; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
127entry:
128  %0 = xor i32 %x, -1
129  %1 = and i32 %z, %0
130  %2 = and i32 %y, %x
131  %3 = or i32 %1, %2
132  ret i32 %3
133}
134
135; SHA-256 Ch function
136; z ^ (x & (y ^ z))
; Kernel variant of the Ch pattern: stores z ^ (x & (y ^ z)) through %out.
; All run lines expect the scalar s_xor_b32 / s_and_b32 / s_xor_b32 sequence.
137define amdgpu_kernel void @s_bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
138; GFX7-LABEL: s_bfi_sha256_ch:
139; GFX7:       ; %bb.0: ; %entry
140; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
141; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xd
142; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
143; GFX7-NEXT:    s_mov_b32 s3, 0xf000
144; GFX7-NEXT:    s_mov_b32 s2, -1
145; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX7-NEXT:    s_xor_b32 s5, s5, s6
147; GFX7-NEXT:    s_and_b32 s4, s4, s5
148; GFX7-NEXT:    s_xor_b32 s4, s6, s4
149; GFX7-NEXT:    v_mov_b32_e32 v0, s4
150; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
151; GFX7-NEXT:    s_endpgm
152;
153; GFX8-LABEL: s_bfi_sha256_ch:
154; GFX8:       ; %bb.0: ; %entry
155; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
156; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
157; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
158; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
159; GFX8-NEXT:    s_xor_b32 s3, s3, s4
160; GFX8-NEXT:    s_and_b32 s2, s2, s3
161; GFX8-NEXT:    s_xor_b32 s2, s4, s2
162; GFX8-NEXT:    v_mov_b32_e32 v0, s0
163; GFX8-NEXT:    v_mov_b32_e32 v1, s1
164; GFX8-NEXT:    v_mov_b32_e32 v2, s2
165; GFX8-NEXT:    flat_store_dword v[0:1], v2
166; GFX8-NEXT:    s_endpgm
167;
168; GFX10-LABEL: s_bfi_sha256_ch:
169; GFX10:       ; %bb.0: ; %entry
170; GFX10-NEXT:    s_clause 0x2
171; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
172; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x34
173; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
174; GFX10-NEXT:    v_mov_b32_e32 v0, 0
175; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
176; GFX10-NEXT:    s_xor_b32 s3, s3, s4
177; GFX10-NEXT:    s_and_b32 s2, s2, s3
178; GFX10-NEXT:    s_xor_b32 s2, s4, s2
179; GFX10-NEXT:    v_mov_b32_e32 v1, s2
180; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
181; GFX10-NEXT:    s_endpgm
182;
183; GFX8-GISEL-LABEL: s_bfi_sha256_ch:
184; GFX8-GISEL:       ; %bb.0: ; %entry
185; GFX8-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
186; GFX8-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x34
187; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
188; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
189; GFX8-GISEL-NEXT:    s_xor_b32 s3, s3, s4
190; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, s3
191; GFX8-GISEL-NEXT:    s_xor_b32 s2, s4, s2
192; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
193; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
194; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
195; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
196; GFX8-GISEL-NEXT:    s_endpgm
197;
198; GFX10-GISEL-LABEL: s_bfi_sha256_ch:
199; GFX10-GISEL:       ; %bb.0: ; %entry
200; GFX10-GISEL-NEXT:    s_clause 0x2
201; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
202; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x34
203; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
204; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
205; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX10-GISEL-NEXT:    s_xor_b32 s3, s3, s4
207; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, s3
208; GFX10-GISEL-NEXT:    s_xor_b32 s2, s4, s2
209; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
210; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
211; GFX10-GISEL-NEXT:    s_endpgm
212entry:
213  %0 = xor i32 %y, %z
214  %1 = and i32 %x, %0
215  %2 = xor i32 %z, %1
216  store i32 %2, i32 addrspace(1)* %out
217  ret void
218}
219
; Callable-function variant of the Ch pattern, returning z ^ (x & (y ^ z)).
; Every run line expects a single v_bfi_b32 v0, v0, v1, v2.
220define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
221; GFX7-LABEL: v_bfi_sha256_ch:
222; GFX7:       ; %bb.0: ; %entry
223; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
225; GFX7-NEXT:    s_setpc_b64 s[30:31]
226;
227; GFX8-LABEL: v_bfi_sha256_ch:
228; GFX8:       ; %bb.0: ; %entry
229; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
231; GFX8-NEXT:    s_setpc_b64 s[30:31]
232;
233; GFX10-LABEL: v_bfi_sha256_ch:
234; GFX10:       ; %bb.0: ; %entry
235; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
237; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
238; GFX10-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX8-GISEL-LABEL: v_bfi_sha256_ch:
241; GFX8-GISEL:       ; %bb.0: ; %entry
242; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
244; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
245;
246; GFX10-GISEL-LABEL: v_bfi_sha256_ch:
247; GFX10-GISEL:       ; %bb.0: ; %entry
248; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
250; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
251; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
252entry:
253  %0 = xor i32 %y, %z
254  %1 = and i32 %x, %0
255  %2 = xor i32 %z, %1
256  ret i32 %2
257}
258
; Ch pattern in an amdgpu_ps function with %y and %z marked inreg (they appear
; as s0/s1 in the expected output) and %x in v0.
; - Without -global-isel: tahiti and tonga copy s1 into v1 before v_bfi_b32,
;   while gfx1031 passes both SGPRs straight to v_bfi_b32.
; - With -global-isel: no v_bfi_b32 is expected; the checks pin the expanded
;   s_xor_b32 / v_and_b32 / v_xor_b32 sequence.
259define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
260; GFX7-LABEL: v_s_s_bfi_sha256_ch:
261; GFX7:       ; %bb.0: ; %entry
262; GFX7-NEXT:    v_mov_b32_e32 v1, s1
263; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
264; GFX7-NEXT:    ; return to shader part epilog
265;
266; GFX8-LABEL: v_s_s_bfi_sha256_ch:
267; GFX8:       ; %bb.0: ; %entry
268; GFX8-NEXT:    v_mov_b32_e32 v1, s1
269; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
270; GFX8-NEXT:    ; return to shader part epilog
271;
272; GFX10-LABEL: v_s_s_bfi_sha256_ch:
273; GFX10:       ; %bb.0: ; %entry
274; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s1
275; GFX10-NEXT:    ; return to shader part epilog
276;
277; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ch:
278; GFX8-GISEL:       ; %bb.0: ; %entry
279; GFX8-GISEL-NEXT:    s_xor_b32 s0, s0, s1
280; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
281; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
282; GFX8-GISEL-NEXT:    ; return to shader part epilog
283;
284; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ch:
285; GFX10-GISEL:       ; %bb.0: ; %entry
286; GFX10-GISEL-NEXT:    s_xor_b32 s0, s0, s1
287; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
288; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
289; GFX10-GISEL-NEXT:    ; return to shader part epilog
290entry:
291  %xor0 = xor i32 %y, %z
292  %and = and i32 %x, %xor0
293  %xor1 = xor i32 %z, %and
294  %cast = bitcast i32 %xor1 to float
295  ret float %cast
296}
297
; Ch pattern with %x and %z inreg (s0/s1 in the expected output) and %y in v0.
; - Without -global-isel: tahiti and tonga copy s1 into v1 before v_bfi_b32;
;   gfx1031 uses s1 directly as the last operand.
; - With -global-isel: the checks pin v_xor_b32 / v_and_b32 / v_xor_b32 and no
;   v_bfi_b32.
298define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
299; GFX7-LABEL: s_v_s_bfi_sha256_ch:
300; GFX7:       ; %bb.0: ; %entry
301; GFX7-NEXT:    v_mov_b32_e32 v1, s1
302; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
303; GFX7-NEXT:    ; return to shader part epilog
304;
305; GFX8-LABEL: s_v_s_bfi_sha256_ch:
306; GFX8:       ; %bb.0: ; %entry
307; GFX8-NEXT:    v_mov_b32_e32 v1, s1
308; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
309; GFX8-NEXT:    ; return to shader part epilog
310;
311; GFX10-LABEL: s_v_s_bfi_sha256_ch:
312; GFX10:       ; %bb.0: ; %entry
313; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s1
314; GFX10-NEXT:    ; return to shader part epilog
315;
316; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ch:
317; GFX8-GISEL:       ; %bb.0: ; %entry
318; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
319; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
320; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
321; GFX8-GISEL-NEXT:    ; return to shader part epilog
322;
323; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ch:
324; GFX10-GISEL:       ; %bb.0: ; %entry
325; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
326; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
327; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
328; GFX10-GISEL-NEXT:    ; return to shader part epilog
329entry:
330  %xor0 = xor i32 %y, %z
331  %and = and i32 %x, %xor0
332  %xor1 = xor i32 %z, %and
333  %cast = bitcast i32 %xor1 to float
334  ret float %cast
335}
336
; Ch pattern with %x and %y inreg (s0/s1 in the expected output) and %z in v0.
; Every run line expects a v_bfi_b32; the runs differ only in which SGPR, if
; any, is first copied to v1: s1 on tahiti/tonga without -global-isel, s0 on
; tonga with -global-isel, and none on gfx1031.
337define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
338; GFX7-LABEL: s_s_v_bfi_sha256_ch:
339; GFX7:       ; %bb.0: ; %entry
340; GFX7-NEXT:    v_mov_b32_e32 v1, s1
341; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
342; GFX7-NEXT:    ; return to shader part epilog
343;
344; GFX8-LABEL: s_s_v_bfi_sha256_ch:
345; GFX8:       ; %bb.0: ; %entry
346; GFX8-NEXT:    v_mov_b32_e32 v1, s1
347; GFX8-NEXT:    v_bfi_b32 v0, s0, v1, v0
348; GFX8-NEXT:    ; return to shader part epilog
349;
350; GFX10-LABEL: s_s_v_bfi_sha256_ch:
351; GFX10:       ; %bb.0: ; %entry
352; GFX10-NEXT:    v_bfi_b32 v0, s0, s1, v0
353; GFX10-NEXT:    ; return to shader part epilog
354;
355; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
356; GFX8-GISEL:       ; %bb.0: ; %entry
357; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s0
358; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v1, s1, v0
359; GFX8-GISEL-NEXT:    ; return to shader part epilog
360;
361; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ch:
362; GFX10-GISEL:       ; %bb.0: ; %entry
363; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, s1, v0
364; GFX10-GISEL-NEXT:    ; return to shader part epilog
365entry:
366  %xor0 = xor i32 %y, %z
367  %and = and i32 %x, %xor0
368  %xor1 = xor i32 %z, %and
369  %cast = bitcast i32 %xor1 to float
370  ret float %cast
371}
372
; Ch pattern with only %x inreg (s0 in the expected output). All run lines
; expect v_bfi_b32 v0, s0, v0, v1 with no extra copies.
373define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
374; GFX7-LABEL: s_v_v_bfi_sha256_ch:
375; GFX7:       ; %bb.0: ; %entry
376; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
377; GFX7-NEXT:    ; return to shader part epilog
378;
379; GFX8-LABEL: s_v_v_bfi_sha256_ch:
380; GFX8:       ; %bb.0: ; %entry
381; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
382; GFX8-NEXT:    ; return to shader part epilog
383;
384; GFX10-LABEL: s_v_v_bfi_sha256_ch:
385; GFX10:       ; %bb.0: ; %entry
386; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v1
387; GFX10-NEXT:    ; return to shader part epilog
388;
389; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
390; GFX8-GISEL:       ; %bb.0: ; %entry
391; GFX8-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
392; GFX8-GISEL-NEXT:    ; return to shader part epilog
393;
394; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
395; GFX10-GISEL:       ; %bb.0: ; %entry
396; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
397; GFX10-GISEL-NEXT:    ; return to shader part epilog
398entry:
399  %xor0 = xor i32 %y, %z
400  %and = and i32 %x, %xor0
401  %xor1 = xor i32 %z, %and
402  %cast = bitcast i32 %xor1 to float
403  ret float %cast
404}
405
; Ch pattern with only %y inreg (s0 in the expected output). All run lines
; expect v_bfi_b32 v0, v0, s0, v1 with no extra copies.
406define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
407; GFX7-LABEL: v_s_v_bfi_sha256_ch:
408; GFX7:       ; %bb.0: ; %entry
409; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
410; GFX7-NEXT:    ; return to shader part epilog
411;
412; GFX8-LABEL: v_s_v_bfi_sha256_ch:
413; GFX8:       ; %bb.0: ; %entry
414; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
415; GFX8-NEXT:    ; return to shader part epilog
416;
417; GFX10-LABEL: v_s_v_bfi_sha256_ch:
418; GFX10:       ; %bb.0: ; %entry
419; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v1
420; GFX10-NEXT:    ; return to shader part epilog
421;
422; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ch:
423; GFX8-GISEL:       ; %bb.0: ; %entry
424; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v1
425; GFX8-GISEL-NEXT:    ; return to shader part epilog
426;
427; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ch:
428; GFX10-GISEL:       ; %bb.0: ; %entry
429; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v1
430; GFX10-GISEL-NEXT:    ; return to shader part epilog
431entry:
432  %xor0 = xor i32 %y, %z
433  %and = and i32 %x, %xor0
434  %xor1 = xor i32 %z, %and
435  %cast = bitcast i32 %xor1 to float
436  ret float %cast
437}
438
; Ch pattern with only %z inreg (s0 in the expected output).
; - Without -global-isel: every target expects v_bfi_b32 v0, v0, v1, s0.
; - With -global-isel: the checks pin v_xor_b32 / v_and_b32 / v_xor_b32 and no
;   v_bfi_b32.
439define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
440; GFX7-LABEL: v_v_s_bfi_sha256_ch:
441; GFX7:       ; %bb.0: ; %entry
442; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, s0
443; GFX7-NEXT:    ; return to shader part epilog
444;
445; GFX8-LABEL: v_v_s_bfi_sha256_ch:
446; GFX8:       ; %bb.0: ; %entry
447; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, s0
448; GFX8-NEXT:    ; return to shader part epilog
449;
450; GFX10-LABEL: v_v_s_bfi_sha256_ch:
451; GFX10:       ; %bb.0: ; %entry
452; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, s0
453; GFX10-NEXT:    ; return to shader part epilog
454;
455; GFX8-GISEL-LABEL: v_v_s_bfi_sha256_ch:
456; GFX8-GISEL:       ; %bb.0: ; %entry
457; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
458; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
459; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
460; GFX8-GISEL-NEXT:    ; return to shader part epilog
461;
462; GFX10-GISEL-LABEL: v_v_s_bfi_sha256_ch:
463; GFX10-GISEL:       ; %bb.0: ; %entry
464; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
465; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
466; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
467; GFX10-GISEL-NEXT:    ; return to shader part epilog
468entry:
469  %xor0 = xor i32 %y, %z
470  %and = and i32 %x, %xor0
471  %xor1 = xor i32 %z, %and
472  %cast = bitcast i32 %xor1 to float
473  ret float %cast
474}
475
476; SHA-256 Ma function
477; ((x & z) | (y & (x | z)))
; Kernel variant of the Ma pattern: stores (x & z) | (y & (x | z)) through
; %out. All run lines expect four scalar ops (two s_and_b32, two s_or_b32);
; the gfx1031 runs order them differently from tahiti/tonga.
478define amdgpu_kernel void @s_bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
479; GFX7-LABEL: s_bfi_sha256_ma:
480; GFX7:       ; %bb.0: ; %entry
481; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
482; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xd
483; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
484; GFX7-NEXT:    s_mov_b32 s3, 0xf000
485; GFX7-NEXT:    s_mov_b32 s2, -1
486; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX7-NEXT:    s_and_b32 s7, s4, s6
488; GFX7-NEXT:    s_or_b32 s4, s4, s6
489; GFX7-NEXT:    s_and_b32 s4, s5, s4
490; GFX7-NEXT:    s_or_b32 s4, s7, s4
491; GFX7-NEXT:    v_mov_b32_e32 v0, s4
492; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
493; GFX7-NEXT:    s_endpgm
494;
495; GFX8-LABEL: s_bfi_sha256_ma:
496; GFX8:       ; %bb.0: ; %entry
497; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
498; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
499; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
500; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX8-NEXT:    s_and_b32 s5, s2, s4
502; GFX8-NEXT:    s_or_b32 s2, s2, s4
503; GFX8-NEXT:    s_and_b32 s2, s3, s2
504; GFX8-NEXT:    s_or_b32 s2, s5, s2
505; GFX8-NEXT:    v_mov_b32_e32 v0, s0
506; GFX8-NEXT:    v_mov_b32_e32 v1, s1
507; GFX8-NEXT:    v_mov_b32_e32 v2, s2
508; GFX8-NEXT:    flat_store_dword v[0:1], v2
509; GFX8-NEXT:    s_endpgm
510;
511; GFX10-LABEL: s_bfi_sha256_ma:
512; GFX10:       ; %bb.0: ; %entry
513; GFX10-NEXT:    s_clause 0x2
514; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
515; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x34
516; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
517; GFX10-NEXT:    v_mov_b32_e32 v0, 0
518; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
519; GFX10-NEXT:    s_or_b32 s5, s2, s4
520; GFX10-NEXT:    s_and_b32 s2, s2, s4
521; GFX10-NEXT:    s_and_b32 s3, s3, s5
522; GFX10-NEXT:    s_or_b32 s2, s2, s3
523; GFX10-NEXT:    v_mov_b32_e32 v1, s2
524; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
525; GFX10-NEXT:    s_endpgm
526;
527; GFX8-GISEL-LABEL: s_bfi_sha256_ma:
528; GFX8-GISEL:       ; %bb.0: ; %entry
529; GFX8-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
530; GFX8-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x34
531; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
532; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX8-GISEL-NEXT:    s_and_b32 s5, s2, s4
534; GFX8-GISEL-NEXT:    s_or_b32 s2, s2, s4
535; GFX8-GISEL-NEXT:    s_and_b32 s2, s3, s2
536; GFX8-GISEL-NEXT:    s_or_b32 s2, s5, s2
537; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
538; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
539; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
540; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
541; GFX8-GISEL-NEXT:    s_endpgm
542;
543; GFX10-GISEL-LABEL: s_bfi_sha256_ma:
544; GFX10-GISEL:       ; %bb.0: ; %entry
545; GFX10-GISEL-NEXT:    s_clause 0x2
546; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
547; GFX10-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x34
548; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
549; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
550; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX10-GISEL-NEXT:    s_or_b32 s5, s2, s4
552; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, s4
553; GFX10-GISEL-NEXT:    s_and_b32 s3, s3, s5
554; GFX10-GISEL-NEXT:    s_or_b32 s2, s2, s3
555; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
556; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
557; GFX10-GISEL-NEXT:    s_endpgm
558entry:
559  %0 = and i32 %x, %z
560  %1 = or i32 %x, %z
561  %2 = and i32 %y, %1
562  %3 = or i32 %0, %2
563  store i32 %3, i32 addrspace(1)* %out
564  ret void
565}
566
; Callable-function variant of the Ma pattern, returning
; (x & z) | (y & (x | z)). Every run line expects v_xor_b32 v0, v0, v1
; followed by v_bfi_b32 v0, v0, v2, v1.
567define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
568; GFX7-LABEL: v_bfi_sha256_ma:
569; GFX7:       ; %bb.0: ; %entry
570; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
571; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
572; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v1
573; GFX7-NEXT:    s_setpc_b64 s[30:31]
574;
575; GFX8-LABEL: v_bfi_sha256_ma:
576; GFX8:       ; %bb.0: ; %entry
577; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
578; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
579; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v1
580; GFX8-NEXT:    s_setpc_b64 s[30:31]
581;
582; GFX10-LABEL: v_bfi_sha256_ma:
583; GFX10:       ; %bb.0: ; %entry
584; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
586; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
587; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v1
588; GFX10-NEXT:    s_setpc_b64 s[30:31]
589;
590; GFX8-GISEL-LABEL: v_bfi_sha256_ma:
591; GFX8-GISEL:       ; %bb.0: ; %entry
592; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
594; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
595; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
596;
597; GFX10-GISEL-LABEL: v_bfi_sha256_ma:
598; GFX10-GISEL:       ; %bb.0: ; %entry
599; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
600; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
601; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
602; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
603; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
604entry:
605  %0 = and i32 %x, %z
606  %1 = or i32 %x, %z
607  %2 = and i32 %y, %1
608  %3 = or i32 %0, %2
609  ret i32 %3
610}
611
; Vector bitselect written as ((a ^ mask) & b) ^ mask on <2 x i32>.
; - Without -global-isel: one v_bfi_b32 per element.
; - With -global-isel: the checks pin v_xor_b32 / v_and_b32 / v_xor_b32 per
;   element and no v_bfi_b32.
612define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
613; GFX7-LABEL: v_bitselect_v2i32_pat1:
614; GFX7:       ; %bb.0:
615; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
616; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
617; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
618; GFX7-NEXT:    s_setpc_b64 s[30:31]
619;
620; GFX8-LABEL: v_bitselect_v2i32_pat1:
621; GFX8:       ; %bb.0:
622; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
623; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
624; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
625; GFX8-NEXT:    s_setpc_b64 s[30:31]
626;
627; GFX10-LABEL: v_bitselect_v2i32_pat1:
628; GFX10:       ; %bb.0:
629; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
630; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
631; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
632; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
633; GFX10-NEXT:    s_setpc_b64 s[30:31]
634;
635; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
636; GFX8-GISEL:       ; %bb.0:
637; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
639; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
640; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
641; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
642; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
643; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
644; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
645;
646; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
647; GFX10-GISEL:       ; %bb.0:
648; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
650; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
651; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
652; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
653; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
654; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
655; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
656; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
657  %xor.0 = xor <2 x i32> %a, %mask
658  %and = and <2 x i32> %xor.0, %b
659  %bitselect = xor <2 x i32> %and, %mask
660  ret <2 x i32> %bitselect
661}
662
; 64-bit bitselect written as (a & b) | (~a & mask).
; - Without -global-isel: one v_bfi_b32 per result register (v0 and v1).
; - With -global-isel: no v_bfi_b32; each result register is expected to be
;   built from v_and_b32, v_xor_b32 with -1, v_and_b32 and v_or_b32.
663define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
664; GFX7-LABEL: v_bitselect_i64_pat_0:
665; GFX7:       ; %bb.0:
666; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, v5
668; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v4
669; GFX7-NEXT:    s_setpc_b64 s[30:31]
670;
671; GFX8-LABEL: v_bitselect_i64_pat_0:
672; GFX8:       ; %bb.0:
673; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
674; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, v5
675; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v4
676; GFX8-NEXT:    s_setpc_b64 s[30:31]
677;
678; GFX10-LABEL: v_bitselect_i64_pat_0:
679; GFX10:       ; %bb.0:
680; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
681; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
682; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v4
683; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, v5
684; GFX10-NEXT:    s_setpc_b64 s[30:31]
685;
686; GFX8-GISEL-LABEL: v_bitselect_i64_pat_0:
687; GFX8-GISEL:       ; %bb.0:
688; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v0, v2
690; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v1, v3
691; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
692; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
693; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
694; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
695; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
696; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
697; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
698;
699; GFX10-GISEL-LABEL: v_bitselect_i64_pat_0:
700; GFX10-GISEL:       ; %bb.0:
701; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
702; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
703; GFX10-GISEL-NEXT:    v_xor_b32_e32 v6, -1, v0
704; GFX10-GISEL-NEXT:    v_xor_b32_e32 v7, -1, v1
705; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
706; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
707; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v6, v4
708; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v7, v5
709; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
710; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
711; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
712  %and0 = and i64 %a, %b
713  %not.a = xor i64 %a, -1
714  %and1 = and i64 %not.a, %mask
715  %bitselect = or i64 %and0, %and1
716  ret i64 %bitselect
717}
718
; (a & b) | (~a & mask) on i64 in an amdgpu_ps function with %b and %mask
; inreg (s0-s3 in the expected output) and %a in v0/v1.
; - Without -global-isel: two v_bfi_b32; tahiti and tonga first copy s3 and s2
;   into v2, while gfx1031 uses the SGPRs directly.
; - With -global-isel: no v_bfi_b32; the checks pin the and / xor -1 / and / or
;   expansion using v_* instructions.
719define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
720; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
721; GFX7:       ; %bb.0:
722; GFX7-NEXT:    v_mov_b32_e32 v2, s3
723; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
724; GFX7-NEXT:    v_mov_b32_e32 v2, s2
725; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
726; GFX7-NEXT:    ; return to shader part epilog
727;
728; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
729; GFX8:       ; %bb.0:
730; GFX8-NEXT:    v_mov_b32_e32 v2, s3
731; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
732; GFX8-NEXT:    v_mov_b32_e32 v2, s2
733; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
734; GFX8-NEXT:    ; return to shader part epilog
735;
736; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
737; GFX10:       ; %bb.0:
738; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
739; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
740; GFX10-NEXT:    ; return to shader part epilog
741;
742; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
743; GFX8-GISEL:       ; %bb.0:
744; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v0
745; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v1
746; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
747; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
748; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
749; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s3, v1
750; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
751; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
752; GFX8-GISEL-NEXT:    ; return to shader part epilog
753;
754; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
755; GFX10-GISEL:       ; %bb.0:
756; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, -1, v0
757; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, -1, v1
758; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
759; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
760; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
761; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
762; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
763; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
764; GFX10-GISEL-NEXT:    ; return to shader part epilog
765  %and0 = and i64 %a, %b
766  %not.a = xor i64 %a, -1
767  %and1 = and i64 %not.a, %mask
768  %bitselect = or i64 %and0, %and1
769  %cast = bitcast i64 %bitselect to <2 x float>
770  ret <2 x float> %cast
771}
772
; (a & b) | (~a & mask) on i64 with %a and %mask inreg (s0-s3 in the expected
; output) and %b in v0/v1.
; - Without -global-isel: two v_bfi_b32; tahiti and tonga first copy s3 and s2
;   into v2, while gfx1031 uses the SGPRs directly.
; - With -global-isel: no v_bfi_b32; the checks pin two v_and_b32, a single
;   s_andn2_b64 of s[2:3] with s[0:1], and two v_or_b32 combining the results.
773define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
774; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
775; GFX7:       ; %bb.0:
776; GFX7-NEXT:    v_mov_b32_e32 v2, s3
777; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
778; GFX7-NEXT:    v_mov_b32_e32 v2, s2
779; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
780; GFX7-NEXT:    ; return to shader part epilog
781;
782; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
783; GFX8:       ; %bb.0:
784; GFX8-NEXT:    v_mov_b32_e32 v2, s3
785; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
786; GFX8-NEXT:    v_mov_b32_e32 v2, s2
787; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
788; GFX8-NEXT:    ; return to shader part epilog
789;
790; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
791; GFX10:       ; %bb.0:
792; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
793; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
794; GFX10-NEXT:    ; return to shader part epilog
795;
796; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
797; GFX8-GISEL:       ; %bb.0:
798; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
799; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
800; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
801; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
802; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
803; GFX8-GISEL-NEXT:    ; return to shader part epilog
804;
805; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
806; GFX10-GISEL:       ; %bb.0:
807; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
808; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
809; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
810; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
811; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
812; GFX10-GISEL-NEXT:    ; return to shader part epilog
813  %and0 = and i64 %a, %b
814  %not.a = xor i64 %a, -1
815  %and1 = and i64 %not.a, %mask
816  %bitselect = or i64 %and0, %and1
817  %cast = bitcast i64 %bitselect to <2 x float>
818  ret <2 x float> %cast
819}
820
; 64-bit bitselect, pattern 0: (a & b) | (~a & mask), bitcast to <2 x float>.
; Operand placement: %a and %b are inreg (the checks read them from s[0:1] and
; s[2:3]); %mask arrives in v[0:1].
; GFX7/GFX8/GFX10 checks expect one v_bfi_b32 per 32-bit half. GFX7/GFX8 first
; copy each half of %b into v2; GFX10 passes both SGPR operands directly.
; The -global-isel checks contain no v_bfi_b32: s_and_b64 and s_not_b64 on the
; scalar operands, followed by v_and and v_or.
;
821define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
822; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
823; GFX7:       ; %bb.0:
824; GFX7-NEXT:    v_mov_b32_e32 v2, s3
825; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
826; GFX7-NEXT:    v_mov_b32_e32 v2, s2
827; GFX7-NEXT:    v_bfi_b32 v0, s0, v2, v0
828; GFX7-NEXT:    ; return to shader part epilog
829;
830; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
831; GFX8:       ; %bb.0:
832; GFX8-NEXT:    v_mov_b32_e32 v2, s3
833; GFX8-NEXT:    v_bfi_b32 v1, s1, v2, v1
834; GFX8-NEXT:    v_mov_b32_e32 v2, s2
835; GFX8-NEXT:    v_bfi_b32 v0, s0, v2, v0
836; GFX8-NEXT:    ; return to shader part epilog
837;
838; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
839; GFX10:       ; %bb.0:
840; GFX10-NEXT:    v_bfi_b32 v0, s0, s2, v0
841; GFX10-NEXT:    v_bfi_b32 v1, s1, s3, v1
842; GFX10-NEXT:    ; return to shader part epilog
843;
844; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
845; GFX8-GISEL:       ; %bb.0:
846; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
847; GFX8-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
848; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
849; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
850; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s2, v0
851; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s3, v1
852; GFX8-GISEL-NEXT:    ; return to shader part epilog
853;
854; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
855; GFX10-GISEL:       ; %bb.0:
856; GFX10-GISEL-NEXT:    s_not_b64 s[4:5], s[0:1]
857; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
858; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
859; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s5, v1
860; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
861; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
862; GFX10-GISEL-NEXT:    ; return to shader part epilog
863  %and0 = and i64 %a, %b
864  %not.a = xor i64 %a, -1
865  %and1 = and i64 %not.a, %mask
866  %bitselect = or i64 %and0, %and1
867  %cast = bitcast i64 %bitselect to <2 x float>
868  ret <2 x float> %cast
869}
870
; 64-bit bitselect, pattern 0: (a & b) | (~a & mask), bitcast to <2 x float>.
; Operand placement: %a and %b arrive in v[0:1] and v[2:3]; only %mask is inreg
; (the checks read it from s[0:1]).
; GFX7/GFX8/GFX10 checks expect just two v_bfi_b32, each taking the single SGPR
; operand directly with no v_mov_b32 copy.
; The -global-isel checks contain no v_bfi_b32: v_and, v_xor with -1, v_and,
; v_or for each 32-bit half.
;
871define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
872; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
873; GFX7:       ; %bb.0:
874; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
875; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
876; GFX7-NEXT:    ; return to shader part epilog
877;
878; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
879; GFX8:       ; %bb.0:
880; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
881; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
882; GFX8-NEXT:    ; return to shader part epilog
883;
884; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
885; GFX10:       ; %bb.0:
886; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
887; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
888; GFX10-NEXT:    ; return to shader part epilog
889;
890; GFX8-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
891; GFX8-GISEL:       ; %bb.0:
892; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v0, v2
893; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v1, v3
894; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
895; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
896; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
897; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
898; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
899; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
900; GFX8-GISEL-NEXT:    ; return to shader part epilog
901;
902; GFX10-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
903; GFX10-GISEL:       ; %bb.0:
904; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, -1, v0
905; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, -1, v1
906; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
907; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
908; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s0, v4
909; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s1, v5
910; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
911; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
912; GFX10-GISEL-NEXT:    ; return to shader part epilog
913  %and0 = and i64 %a, %b
914  %not.a = xor i64 %a, -1
915  %and1 = and i64 %not.a, %mask
916  %bitselect = or i64 %and0, %and1
917  %cast = bitcast i64 %bitselect to <2 x float>
918  ret <2 x float> %cast
919}
920
; 64-bit bitselect, pattern 0: (a & b) | (~a & mask), bitcast to <2 x float>.
; Operand placement: %a and %mask arrive in v[0:1] and v[2:3]; only %b is inreg
; (the checks read it from s[0:1]).
; GFX7/GFX8/GFX10 checks expect just two v_bfi_b32, each taking the single SGPR
; operand directly with no v_mov_b32 copy.
; The -global-isel checks contain no v_bfi_b32: v_and, v_xor with -1, v_and,
; v_or for each 32-bit half.
;
921define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
922; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
923; GFX7:       ; %bb.0:
924; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v3
925; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
926; GFX7-NEXT:    ; return to shader part epilog
927;
928; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
929; GFX8:       ; %bb.0:
930; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v3
931; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
932; GFX8-NEXT:    ; return to shader part epilog
933;
934; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
935; GFX10:       ; %bb.0:
936; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v2
937; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, v3
938; GFX10-NEXT:    ; return to shader part epilog
939;
940; GFX8-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
941; GFX8-GISEL:       ; %bb.0:
942; GFX8-GISEL-NEXT:    v_and_b32_e32 v4, s0, v0
943; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, s1, v1
944; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
945; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
946; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
947; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
948; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
949; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v5, v1
950; GFX8-GISEL-NEXT:    ; return to shader part epilog
951;
952; GFX10-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
953; GFX10-GISEL:       ; %bb.0:
954; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, -1, v0
955; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, -1, v1
956; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
957; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
958; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v4, v2
959; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v5, v3
960; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
961; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
962; GFX10-GISEL-NEXT:    ; return to shader part epilog
963  %and0 = and i64 %a, %b
964  %not.a = xor i64 %a, -1
965  %and1 = and i64 %not.a, %mask
966  %bitselect = or i64 %and0, %and1
967  %cast = bitcast i64 %bitselect to <2 x float>
968  ret <2 x float> %cast
969}
970
; 64-bit bitselect, pattern 0: (a & b) | (~a & mask), bitcast to <2 x float>.
; Operand placement: only %a is inreg (the checks read it from s[0:1]); %b and
; %mask arrive in v[0:1] and v[2:3].
; GFX7/GFX8/GFX10 checks expect just two v_bfi_b32, each taking the single SGPR
; operand directly with no v_mov_b32 copy.
; The -global-isel checks contain no v_bfi_b32: the complement of %a is formed
; with s_not_b64, and the rest is v_and / v_or.
;
971define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
972; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
973; GFX7:       ; %bb.0:
974; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v3
975; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
976; GFX7-NEXT:    ; return to shader part epilog
977;
978; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
979; GFX8:       ; %bb.0:
980; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v3
981; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
982; GFX8-NEXT:    ; return to shader part epilog
983;
984; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
985; GFX10:       ; %bb.0:
986; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v2
987; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, v3
988; GFX10-NEXT:    ; return to shader part epilog
989;
990; GFX8-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
991; GFX8-GISEL:       ; %bb.0:
992; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
993; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
994; GFX8-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
995; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v2
996; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v3
997; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
998; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
999; GFX8-GISEL-NEXT:    ; return to shader part epilog
1000;
1001; GFX10-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
1002; GFX10-GISEL:       ; %bb.0:
1003; GFX10-GISEL-NEXT:    s_not_b64 s[2:3], s[0:1]
1004; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1005; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1006; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1007; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1008; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1009; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1010; GFX10-GISEL-NEXT:    ; return to shader part epilog
1011  %and0 = and i64 %a, %b
1012  %not.a = xor i64 %a, -1
1013  %and1 = and i64 %not.a, %mask
1014  %bitselect = or i64 %and0, %and1
1015  %cast = bitcast i64 %bitselect to <2 x float>
1016  ret <2 x float> %cast
1017}
1018
; 64-bit bitselect, pattern 1: ((a ^ mask) & b) ^ mask, returned as i64 from a
; regular function (the checks end in s_setpc_b64 s[30:31]).
; All three operands are in VGPRs: the checks use v[0:1] for %a, v[2:3] for %b
; and v[4:5] for %mask.
; GFX7/GFX8/GFX10 checks expect one v_bfi_b32 per 32-bit half with %b as the
; first source, %a as the second and %mask as the third.
; The -global-isel checks contain no v_bfi_b32: v_xor, v_and, v_xor per half.
;
1019define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1020; GFX7-LABEL: v_bitselect_i64_pat_1:
1021; GFX7:       ; %bb.0:
1022; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
1024; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
1025; GFX7-NEXT:    s_setpc_b64 s[30:31]
1026;
1027; GFX8-LABEL: v_bitselect_i64_pat_1:
1028; GFX8:       ; %bb.0:
1029; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1030; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
1031; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
1032; GFX8-NEXT:    s_setpc_b64 s[30:31]
1033;
1034; GFX10-LABEL: v_bitselect_i64_pat_1:
1035; GFX10:       ; %bb.0:
1036; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1038; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
1039; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
1040; GFX10-NEXT:    s_setpc_b64 s[30:31]
1041;
1042; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
1043; GFX8-GISEL:       ; %bb.0:
1044; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1046; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1047; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1048; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1049; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1050; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1051; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1052;
1053; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
1054; GFX10-GISEL:       ; %bb.0:
1055; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1056; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1057; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1058; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1059; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1060; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1061; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1062; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1063; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1064  %xor.0 = xor i64 %a, %mask
1065  %and = and i64 %xor.0, %b
1066  %bitselect = xor i64 %and, %mask
1067  ret i64 %bitselect
1068}
1069
; 64-bit bitselect, pattern 1: ((a ^ mask) & b) ^ mask, bitcast to <2 x float>.
; Operand placement: %a arrives in v[0:1]; %b and %mask are inreg (the checks
; read them from s[0:1] and s[2:3]).
; GFX7/GFX8/GFX10 checks expect one v_bfi_b32 per 32-bit half. GFX7/GFX8 first
; copy each mask half into v2; GFX10 passes both SGPR operands directly.
; The -global-isel checks contain no v_bfi_b32: v_xor, v_and, v_xor per half.
;
1070define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
1071; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
1072; GFX7:       ; %bb.0:
1073; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1074; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
1075; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1076; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
1077; GFX7-NEXT:    ; return to shader part epilog
1078;
1079; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
1080; GFX8:       ; %bb.0:
1081; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1082; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
1083; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1084; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
1085; GFX8-NEXT:    ; return to shader part epilog
1086;
1087; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
1088; GFX10:       ; %bb.0:
1089; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
1090; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
1091; GFX10-NEXT:    ; return to shader part epilog
1092;
1093; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
1094; GFX8-GISEL:       ; %bb.0:
1095; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1096; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1097; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1098; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1099; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1100; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1101; GFX8-GISEL-NEXT:    ; return to shader part epilog
1102;
1103; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
1104; GFX10-GISEL:       ; %bb.0:
1105; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1106; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1107; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1108; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1109; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1110; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1111; GFX10-GISEL-NEXT:    ; return to shader part epilog
1112  %xor.0 = xor i64 %a, %mask
1113  %and = and i64 %xor.0, %b
1114  %bitselect = xor i64 %and, %mask
1115  %cast = bitcast i64 %bitselect to <2 x float>
1116  ret <2 x float> %cast
1117}
1118
; 64-bit bitselect, pattern 1: ((a ^ mask) & b) ^ mask, bitcast to <2 x float>.
; Operand placement: %a and %b are inreg (the checks read them from s[0:1] and
; s[2:3]); %mask arrives in v[0:1].
; GFX7/GFX8/GFX10 checks expect one v_bfi_b32 per 32-bit half. GFX7/GFX8 first
; copy each half of %a into v2; GFX10 passes both SGPR operands directly.
; The -global-isel checks contain no v_bfi_b32: v_xor, v_and, v_xor per half.
;
1119define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
1120; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
1121; GFX7:       ; %bb.0:
1122; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1123; GFX7-NEXT:    v_bfi_b32 v1, s3, v2, v1
1124; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1125; GFX7-NEXT:    v_bfi_b32 v0, s2, v2, v0
1126; GFX7-NEXT:    ; return to shader part epilog
1127;
1128; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
1129; GFX8:       ; %bb.0:
1130; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1131; GFX8-NEXT:    v_bfi_b32 v1, s3, v2, v1
1132; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1133; GFX8-NEXT:    v_bfi_b32 v0, s2, v2, v0
1134; GFX8-NEXT:    ; return to shader part epilog
1135;
1136; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
1137; GFX10:       ; %bb.0:
1138; GFX10-NEXT:    v_bfi_b32 v0, s2, s0, v0
1139; GFX10-NEXT:    v_bfi_b32 v1, s3, s1, v1
1140; GFX10-NEXT:    ; return to shader part epilog
1141;
1142; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1143; GFX8-GISEL:       ; %bb.0:
1144; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, s0, v0
1145; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, s1, v1
1146; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1147; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1148; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v0
1149; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v3, v1
1150; GFX8-GISEL-NEXT:    ; return to shader part epilog
1151;
1152; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1153; GFX10-GISEL:       ; %bb.0:
1154; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, s0, v0
1155; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, s1, v1
1156; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1157; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1158; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v0
1159; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v3, v1
1160; GFX10-GISEL-NEXT:    ; return to shader part epilog
1161  %xor.0 = xor i64 %a, %mask
1162  %and = and i64 %xor.0, %b
1163  %bitselect = xor i64 %and, %mask
1164  %cast = bitcast i64 %bitselect to <2 x float>
1165  ret <2 x float> %cast
1166}
1167
; 64-bit bitselect, pattern 1: ((a ^ mask) & b) ^ mask, bitcast to <2 x float>.
; Operand placement: %a and %mask are inreg (the checks read them from s[0:1]
; and s[2:3]); %b arrives in v[0:1].
; GFX7/GFX8/GFX10 checks expect one v_bfi_b32 per 32-bit half. GFX7/GFX8 first
; copy each mask half into v2; GFX10 passes both SGPR operands directly.
; The -global-isel checks contain no v_bfi_b32: the first xor is a single
; s_xor_b64 on the two scalar operands, followed by v_and and v_xor.
;
1168define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
1169; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
1170; GFX7:       ; %bb.0:
1171; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1172; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
1173; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1174; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
1175; GFX7-NEXT:    ; return to shader part epilog
1176;
1177; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
1178; GFX8:       ; %bb.0:
1179; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1180; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
1181; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1182; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
1183; GFX8-NEXT:    ; return to shader part epilog
1184;
1185; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
1186; GFX10:       ; %bb.0:
1187; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
1188; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
1189; GFX10-NEXT:    ; return to shader part epilog
1190;
1191; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1192; GFX8-GISEL:       ; %bb.0:
1193; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
1194; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1195; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1196; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1197; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1198; GFX8-GISEL-NEXT:    ; return to shader part epilog
1199;
1200; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1201; GFX10-GISEL:       ; %bb.0:
1202; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
1203; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1204; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1205; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1206; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1207; GFX10-GISEL-NEXT:    ; return to shader part epilog
1208  %xor.0 = xor i64 %a, %mask
1209  %and = and i64 %xor.0, %b
1210  %bitselect = xor i64 %and, %mask
1211  %cast = bitcast i64 %bitselect to <2 x float>
1212  ret <2 x float> %cast
1213}
1214
; 64-bit bitselect labelled "pattern 2": ((a ^ mask) & b) ^ mask, returned as
; i64 from a regular function with all operands in VGPRs.
; NOTE(review): the three IR instructions below are the same as the body of
; @v_bitselect_i64_pat_1, and the expected code is identical too. Confirm
; whether a distinct expression was intended for this case.
; GFX7/GFX8/GFX10 checks expect one v_bfi_b32 per 32-bit half; the -global-isel
; checks contain no v_bfi_b32 (v_xor, v_and, v_xor per half).
;
1215define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1216; GFX7-LABEL: v_bitselect_i64_pat_2:
1217; GFX7:       ; %bb.0:
1218; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1219; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
1220; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
1221; GFX7-NEXT:    s_setpc_b64 s[30:31]
1222;
1223; GFX8-LABEL: v_bitselect_i64_pat_2:
1224; GFX8:       ; %bb.0:
1225; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1226; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
1227; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
1228; GFX8-NEXT:    s_setpc_b64 s[30:31]
1229;
1230; GFX10-LABEL: v_bitselect_i64_pat_2:
1231; GFX10:       ; %bb.0:
1232; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1233; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1234; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
1235; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
1236; GFX10-NEXT:    s_setpc_b64 s[30:31]
1237;
1238; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
1239; GFX8-GISEL:       ; %bb.0:
1240; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1241; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1242; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1243; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1244; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1245; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1246; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1247; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1248;
1249; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
1250; GFX10-GISEL:       ; %bb.0:
1251; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1252; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1253; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1254; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1255; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1256; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1257; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1258; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1259; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1260  %xor.0 = xor i64 %a, %mask
1261  %and = and i64 %xor.0, %b
1262  %bitselect = xor i64 %and, %mask
1263  ret i64 %bitselect
1264}
1265
; 64-bit form of (x & z) | (y & (x | z)), returned as i64 from a regular
; function with all operands in VGPRs (v[0:1], v[2:3], v[4:5] in the checks).
; The name suggests the SHA-256 "Ma" step; presumably that is the origin of
; the expression, but the IR below is the source of truth.
; GFX7/GFX8/GFX10 checks expect, per 32-bit half, a v_xor_b32 of %x and %y
; feeding a v_bfi_b32 whose other sources are %z and %y.
; The -global-isel checks contain no v_bfi_b32: and / or / and / or per half.
;
1266define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1267; GFX7-LABEL: v_bfi_sha256_ma_i64:
1268; GFX7:       ; %bb.0: ; %entry
1269; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1270; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
1271; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
1272; GFX7-NEXT:    v_bfi_b32 v1, v1, v5, v3
1273; GFX7-NEXT:    v_bfi_b32 v0, v0, v4, v2
1274; GFX7-NEXT:    s_setpc_b64 s[30:31]
1275;
1276; GFX8-LABEL: v_bfi_sha256_ma_i64:
1277; GFX8:       ; %bb.0: ; %entry
1278; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1279; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
1280; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
1281; GFX8-NEXT:    v_bfi_b32 v1, v1, v5, v3
1282; GFX8-NEXT:    v_bfi_b32 v0, v0, v4, v2
1283; GFX8-NEXT:    s_setpc_b64 s[30:31]
1284;
1285; GFX10-LABEL: v_bfi_sha256_ma_i64:
1286; GFX10:       ; %bb.0: ; %entry
1287; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1288; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1289; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
1290; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
1291; GFX10-NEXT:    v_bfi_b32 v0, v0, v4, v2
1292; GFX10-NEXT:    v_bfi_b32 v1, v1, v5, v3
1293; GFX10-NEXT:    s_setpc_b64 s[30:31]
1294;
1295; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
1296; GFX8-GISEL:       ; %bb.0: ; %entry
1297; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1298; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, v0, v4
1299; GFX8-GISEL-NEXT:    v_and_b32_e32 v7, v1, v5
1300; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
1301; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
1302; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v2, v0
1303; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
1304; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v6, v0
1305; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v7, v1
1306; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1307;
1308; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
1309; GFX10-GISEL:       ; %bb.0: ; %entry
1310; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1312; GFX10-GISEL-NEXT:    v_or_b32_e32 v6, v0, v4
1313; GFX10-GISEL-NEXT:    v_or_b32_e32 v7, v1, v5
1314; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
1315; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
1316; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
1317; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
1318; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1319; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1320; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1321entry:
1322  %and0 = and i64 %x, %z
1323  %or0 = or i64 %x, %z
1324  %and1 = and i64 %y, %or0
1325  %or1 = or i64 %and0, %and1
1326  ret i64 %or1
1327}
1328
; 64-bit form of (x & z) | (y & (x | z)), bitcast to <2 x float>.
; Operand placement: %x arrives in v[0:1]; %y and %z are inreg (the checks read
; them from s[0:1] and s[2:3]).
; GFX7/GFX8/GFX10 checks expect a v_xor_b32 plus a v_bfi_b32 per 32-bit half.
; GFX7/GFX8 first copy each half of %y into v2; GFX10 passes both SGPR operands
; to v_bfi_b32 directly.
; The -global-isel checks contain no v_bfi_b32: and / or / and / or per half.
;
1329define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
1330; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
1331; GFX7:       ; %bb.0: ; %entry
1332; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
1333; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1334; GFX7-NEXT:    v_bfi_b32 v1, v1, s3, v2
1335; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
1336; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1337; GFX7-NEXT:    v_bfi_b32 v0, v0, s2, v2
1338; GFX7-NEXT:    ; return to shader part epilog
1339;
1340; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
1341; GFX8:       ; %bb.0: ; %entry
1342; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1343; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1344; GFX8-NEXT:    v_bfi_b32 v1, v1, s3, v2
1345; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1346; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1347; GFX8-NEXT:    v_bfi_b32 v0, v0, s2, v2
1348; GFX8-NEXT:    ; return to shader part epilog
1349;
1350; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
1351; GFX10:       ; %bb.0: ; %entry
1352; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
1353; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
1354; GFX10-NEXT:    v_bfi_b32 v0, v0, s2, s0
1355; GFX10-NEXT:    v_bfi_b32 v1, v1, s3, s1
1356; GFX10-NEXT:    ; return to shader part epilog
1357;
1358; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1359; GFX8-GISEL:       ; %bb.0: ; %entry
1360; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s2, v0
1361; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s3, v1
1362; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s2, v0
1363; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s3, v1
1364; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1365; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1366; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
1367; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
1368; GFX8-GISEL-NEXT:    ; return to shader part epilog
1369;
1370; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1371; GFX10-GISEL:       ; %bb.0: ; %entry
1372; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, s2, v0
1373; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, s3, v1
1374; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1375; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s3, v1
1376; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s0, v2
1377; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s1, v3
1378; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1379; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1380; GFX10-GISEL-NEXT:    ; return to shader part epilog
1381entry:
1382  %and0 = and i64 %x, %z
1383  %or0 = or i64 %x, %z
1384  %and1 = and i64 %y, %or0
1385  %or1 = or i64 %and0, %and1
1386  %cast = bitcast i64 %or1 to <2 x float>
1387  ret <2 x float> %cast
1388}
1389
; 64-bit form of (x & z) | (y & (x | z)), bitcast to <2 x float>.
; Operand placement: %x and %z are inreg (the checks read them from s[0:1] and
; s[2:3]); %y arrives in v[0:1].
; GFX7/GFX8/GFX10 checks expect a v_xor_b32 plus a v_bfi_b32 per 32-bit half,
; each instruction using one SGPR operand and no v_mov_b32 copy.
; The -global-isel checks contain no v_bfi_b32: s_and_b64 and s_or_b64 on the
; two scalar operands, followed by v_and and v_or.
;
1390define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
1391; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
1392; GFX7:       ; %bb.0: ; %entry
1393; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v1
1394; GFX7-NEXT:    v_bfi_b32 v1, v2, s3, v1
1395; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v0
1396; GFX7-NEXT:    v_bfi_b32 v0, v2, s2, v0
1397; GFX7-NEXT:    ; return to shader part epilog
1398;
1399; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
1400; GFX8:       ; %bb.0: ; %entry
1401; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v1
1402; GFX8-NEXT:    v_bfi_b32 v1, v2, s3, v1
1403; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v0
1404; GFX8-NEXT:    v_bfi_b32 v0, v2, s2, v0
1405; GFX8-NEXT:    ; return to shader part epilog
1406;
1407; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
1408; GFX10:       ; %bb.0: ; %entry
1409; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v0
1410; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v1
1411; GFX10-NEXT:    v_bfi_b32 v0, v2, s2, v0
1412; GFX10-NEXT:    v_bfi_b32 v1, v3, s3, v1
1413; GFX10-NEXT:    ; return to shader part epilog
1414;
1415; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1416; GFX8-GISEL:       ; %bb.0: ; %entry
1417; GFX8-GISEL-NEXT:    s_and_b64 s[4:5], s[0:1], s[2:3]
1418; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1419; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1420; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1421; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
1422; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s5, v1
1423; GFX8-GISEL-NEXT:    ; return to shader part epilog
1424;
1425; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1426; GFX10-GISEL:       ; %bb.0: ; %entry
1427; GFX10-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[2:3]
1428; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1429; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
1430; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s5, v1
1431; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
1432; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
1433; GFX10-GISEL-NEXT:    ; return to shader part epilog
1434entry:
1435  %and0 = and i64 %x, %z
1436  %or0 = or i64 %x, %z
1437  %and1 = and i64 %y, %or0
1438  %or1 = or i64 %and0, %and1
1439  %cast = bitcast i64 %or1 to <2 x float>
1440  ret <2 x float> %cast
1441}
1442
; 64-bit form of (x & z) | (y & (x | z)), bitcast to <2 x float>.
; Operand placement: %x and %y are inreg (the checks read them from s[0:1] and
; s[2:3]); %z arrives in v[0:1].
; GFX7/GFX8/GFX10 checks expect a v_xor_b32 plus a v_bfi_b32 per 32-bit half.
; GFX7/GFX8 first copy each half of %y into v2 for the xor; GFX10 uses the
; v_xor_b32_e64 form with both SGPR operands directly.
; The -global-isel checks contain no v_bfi_b32: and / or / and / or per half.
;
1443define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
1444; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
1445; GFX7:       ; %bb.0: ; %entry
1446; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1447; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v2
1448; GFX7-NEXT:    v_bfi_b32 v1, v2, v1, s3
1449; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1450; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v2
1451; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, s2
1452; GFX7-NEXT:    ; return to shader part epilog
1453;
1454; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
1455; GFX8:       ; %bb.0: ; %entry
1456; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1457; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v2
1458; GFX8-NEXT:    v_bfi_b32 v1, v2, v1, s3
1459; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1460; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
1461; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, s2
1462; GFX8-NEXT:    ; return to shader part epilog
1463;
1464; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
1465; GFX10:       ; %bb.0: ; %entry
1466; GFX10-NEXT:    v_xor_b32_e64 v2, s0, s2
1467; GFX10-NEXT:    v_xor_b32_e64 v3, s1, s3
1468; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, s2
1469; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, s3
1470; GFX10-NEXT:    ; return to shader part epilog
1471;
1472; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1473; GFX8-GISEL:       ; %bb.0: ; %entry
1474; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v0
1475; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v1
1476; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
1477; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
1478; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1479; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s3, v1
1480; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
1481; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
1482; GFX8-GISEL-NEXT:    ; return to shader part epilog
1483;
1484; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1485; GFX10-GISEL:       ; %bb.0: ; %entry
1486; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, s0, v0
1487; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, s1, v1
1488; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1489; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1490; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1491; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1492; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1493; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1494; GFX10-GISEL-NEXT:    ; return to shader part epilog
1495entry:
1496  %and0 = and i64 %x, %z
1497  %or0 = or i64 %x, %z
1498  %and1 = and i64 %y, %or0
1499  %or1 = or i64 %and0, %and1
1500  %cast = bitcast i64 %or1 to <2 x float>
1501  ret <2 x float> %cast
1502}
1503
; 64-bit form of (x & z) | (y & (x | z)), bitcast to <2 x float>.
; Operand placement: %x and %z arrive in v[0:1] and v[2:3]; only %y is inreg
; (the checks read it from s[0:1]).
; GFX7/GFX8/GFX10 checks expect a v_xor_b32 plus a v_bfi_b32 per 32-bit half,
; each instruction taking the single SGPR operand directly with no copy.
; The -global-isel checks contain no v_bfi_b32: and / or / and / or per half.
;
1504define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
1505; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
1506; GFX7:       ; %bb.0: ; %entry
1507; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
1508; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
1509; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
1510; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
1511; GFX7-NEXT:    ; return to shader part epilog
1512;
1513; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
1514; GFX8:       ; %bb.0: ; %entry
1515; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1516; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1517; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
1518; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
1519; GFX8-NEXT:    ; return to shader part epilog
1520;
1521; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
1522; GFX10:       ; %bb.0: ; %entry
1523; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
1524; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
1525; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
1526; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
1527; GFX10-NEXT:    ; return to shader part epilog
1528;
1529; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1530; GFX8-GISEL:       ; %bb.0: ; %entry
1531; GFX8-GISEL-NEXT:    v_and_b32_e32 v4, v0, v2
1532; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, v1, v3
1533; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1534; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1535; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1536; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1537; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
1538; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v5, v1
1539; GFX8-GISEL-NEXT:    ; return to shader part epilog
1540;
1541; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1542; GFX10-GISEL:       ; %bb.0: ; %entry
1543; GFX10-GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
1544; GFX10-GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
1545; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1546; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1547; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s0, v4
1548; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s1, v5
1549; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1550; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1551; GFX10-GISEL-NEXT:    ; return to shader part epilog
1552entry:
1553  %and0 = and i64 %x, %z
1554  %or0 = or i64 %x, %z
1555  %and1 = and i64 %y, %or0
1556  %or1 = or i64 %and0, %and1
1557  %cast = bitcast i64 %or1 to <2 x float>
1558  ret <2 x float> %cast
1559}
1560
; Scalar i64 bitselect written in and/or form:
;   (a & b) | (~a & mask)
; The kernel arguments are loaded into SGPRs, and every expected sequence below
; keeps the select on the scalar unit as s_and_b64 + s_andn2_b64 + s_or_b64;
; no v_bfi_b32 appears in any of them.
1561define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
1562; GFX7-LABEL: s_bitselect_i64_pat_0:
1563; GFX7:       ; %bb.0:
1564; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1565; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1566; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1567; GFX7-NEXT:    s_mov_b32 s2, -1
1568; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1569; GFX7-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
1570; GFX7-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1571; GFX7-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
1572; GFX7-NEXT:    s_add_u32 s0, s0, 10
1573; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1574; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1575; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1576; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1577; GFX7-NEXT:    s_endpgm
1578;
1579; GFX8-LABEL: s_bitselect_i64_pat_0:
1580; GFX8:       ; %bb.0:
1581; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1582; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1583; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1584; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1585; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1586; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1587; GFX8-NEXT:    s_add_u32 s0, s0, 10
1588; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1589; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1590; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1591; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1592; GFX8-NEXT:    s_endpgm
1593;
1594; GFX10-LABEL: s_bitselect_i64_pat_0:
1595; GFX10:       ; %bb.0:
1596; GFX10-NEXT:    s_clause 0x1
1597; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1598; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1599; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1600; GFX10-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1601; GFX10-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1602; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1603; GFX10-NEXT:    s_add_u32 s0, s0, 10
1604; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1605; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1606; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1607; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1608; GFX10-NEXT:    s_endpgm
1609;
1610; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
1611; GFX8-GISEL:       ; %bb.0:
1612; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1613; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1614; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1615; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1616; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1617; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1618; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1619; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1620; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1621; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1622; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1623; GFX8-GISEL-NEXT:    s_endpgm
1624;
1625; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0:
1626; GFX10-GISEL:       ; %bb.0:
1627; GFX10-GISEL-NEXT:    s_clause 0x1
1628; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1629; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1630; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1631; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1632; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1633; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1634; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1635; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1636; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1637; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1638; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1639; GFX10-GISEL-NEXT:    s_endpgm
  ; Bits of %b where %a is set, bits of %mask where %a is clear.
1640  %and0 = and i64 %a, %b
1641  %not.a = xor i64 %a, -1
1642  %and1 = and i64 %not.a, %mask
1643  %bitselect = or i64 %and0, %and1
  ; The add consumes the select result and is checked as an
  ; s_add_u32 / s_addc_u32 pair. Presumably this scalar user is what keeps
  ; the select off the VALU - TODO confirm.
1644  %scalar.use = add i64 %bitselect, 10
  ; The pointer operand is undef, so the address registers in the checked
  ; store instructions carry no meaningful value.
1645  store i64 %scalar.use, i64 addrspace(1)* undef
1646  ret void
1647}
1648
; Scalar i64 bitselect written in xor/and/xor form:
;   ((a ^ mask) & b) ^ mask
; The kernel arguments are loaded into SGPRs, and every expected sequence below
; is s_xor_b64 + s_and_b64 + s_xor_b64 followed by the scalar add; no v_bfi_b32
; appears in any of them.
1649define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1650; GFX7-LABEL: s_bitselect_i64_pat_1:
1651; GFX7:       ; %bb.0:
1652; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1653; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1654; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1655; GFX7-NEXT:    s_mov_b32 s2, -1
1656; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1658; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1659; GFX7-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
1660; GFX7-NEXT:    s_add_u32 s0, s0, 10
1661; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1662; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1663; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1664; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1665; GFX7-NEXT:    s_endpgm
1666;
1667; GFX8-LABEL: s_bitselect_i64_pat_1:
1668; GFX8:       ; %bb.0:
1669; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1670; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1671; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1672; GFX8-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1673; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1674; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1675; GFX8-NEXT:    s_add_u32 s0, s0, 10
1676; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1677; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1678; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1679; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1680; GFX8-NEXT:    s_endpgm
1681;
1682; GFX10-LABEL: s_bitselect_i64_pat_1:
1683; GFX10:       ; %bb.0:
1684; GFX10-NEXT:    s_clause 0x1
1685; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1686; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1687; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1688; GFX10-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1689; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1690; GFX10-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1691; GFX10-NEXT:    s_add_u32 s0, s0, 10
1692; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1693; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1694; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1695; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1696; GFX10-NEXT:    s_endpgm
1697;
1698; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
1699; GFX8-GISEL:       ; %bb.0:
1700; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1701; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1702; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1703; GFX8-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1704; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1705; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1706; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1707; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1708; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1709; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1710; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1711; GFX8-GISEL-NEXT:    s_endpgm
1712;
1713; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1:
1714; GFX10-GISEL:       ; %bb.0:
1715; GFX10-GISEL-NEXT:    s_clause 0x1
1716; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1717; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1718; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX10-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1720; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1721; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1722; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1723; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1724; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1725; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1726; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1727; GFX10-GISEL-NEXT:    s_endpgm
  ; Where %b is set the two xors with %mask cancel and leave %a; where %b is
  ; clear the and yields 0 and the final xor leaves %mask.
1728  %xor.0 = xor i64 %a, %mask
1729  %and = and i64 %xor.0, %b
1730  %bitselect = xor i64 %and, %mask
1731
  ; The add consumes the select result and is checked as an
  ; s_add_u32 / s_addc_u32 pair. Presumably this scalar user is what keeps
  ; the select off the VALU - TODO confirm.
1732  %scalar.use = add i64 %bitselect, 10
  ; The pointer operand is undef, so the address registers in the checked
  ; store instructions carry no meaningful value.
1733  store i64 %scalar.use, i64 addrspace(1)* undef
1734  ret void
1735}
1736
; Scalar i64 bitselect written in xor/and/xor form:
;   ((a ^ mask) & b) ^ mask
; Every expected sequence below is s_xor_b64 + s_and_b64 + s_xor_b64 followed
; by the scalar add; no v_bfi_b32 appears in any of them.
;
; Review remark: the IR body here is identical, instruction for instruction, to
; @s_bitselect_i64_pat_1, so this test currently adds no new pattern coverage.
; Presumably a different operand arrangement was intended - TODO confirm. If
; the IR is changed, the autogenerated assertions must be regenerated with
; utils/update_llc_test_checks.py.
1737define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1738; GFX7-LABEL: s_bitselect_i64_pat_2:
1739; GFX7:       ; %bb.0:
1740; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1741; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1742; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1743; GFX7-NEXT:    s_mov_b32 s2, -1
1744; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1745; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1746; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1747; GFX7-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
1748; GFX7-NEXT:    s_add_u32 s0, s0, 10
1749; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1750; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1751; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1752; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1753; GFX7-NEXT:    s_endpgm
1754;
1755; GFX8-LABEL: s_bitselect_i64_pat_2:
1756; GFX8:       ; %bb.0:
1757; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1758; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1759; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1760; GFX8-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1761; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1762; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1763; GFX8-NEXT:    s_add_u32 s0, s0, 10
1764; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1765; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1766; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1767; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1768; GFX8-NEXT:    s_endpgm
1769;
1770; GFX10-LABEL: s_bitselect_i64_pat_2:
1771; GFX10:       ; %bb.0:
1772; GFX10-NEXT:    s_clause 0x1
1773; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1774; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1775; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX10-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1777; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1778; GFX10-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1779; GFX10-NEXT:    s_add_u32 s0, s0, 10
1780; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1781; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1782; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1783; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1784; GFX10-NEXT:    s_endpgm
1785;
1786; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
1787; GFX8-GISEL:       ; %bb.0:
1788; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1789; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1790; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1791; GFX8-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1792; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1793; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1794; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1795; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1796; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1797; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1798; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1799; GFX8-GISEL-NEXT:    s_endpgm
1800;
1801; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2:
1802; GFX10-GISEL:       ; %bb.0:
1803; GFX10-GISEL-NEXT:    s_clause 0x1
1804; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1805; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1806; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1807; GFX10-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1808; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1809; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1810; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1811; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1812; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1813; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1814; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1815; GFX10-GISEL-NEXT:    s_endpgm
  ; Where %b is set the two xors with %mask cancel and leave %a; where %b is
  ; clear the and yields 0 and the final xor leaves %mask.
1816  %xor.0 = xor i64 %a, %mask
1817  %and = and i64 %xor.0, %b
1818  %bitselect = xor i64 %and, %mask
1819
  ; The add consumes the select result and is checked as an
  ; s_add_u32 / s_addc_u32 pair. Presumably this scalar user is what keeps
  ; the select off the VALU - TODO confirm.
1820  %scalar.use = add i64 %bitselect, 10
  ; The pointer operand is undef, so the address registers in the checked
  ; store instructions carry no meaningful value.
1821  store i64 %scalar.use, i64 addrspace(1)* undef
1822  ret void
1823}
1824
; Scalar i64 version of the expression the function name calls "sha256_ma":
;   (x & z) | (y & (x | z))
; The kernel arguments are loaded into SGPRs, and every expected sequence below
; uses two s_and_b64 and two s_or_b64 (the order of the first and/or differs
; between targets) followed by the scalar add; no v_bfi_b32 appears in any of
; them.
1825define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1826; GFX7-LABEL: s_bfi_sha256_ma_i64:
1827; GFX7:       ; %bb.0: ; %entry
1828; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1829; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1830; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1831; GFX7-NEXT:    s_mov_b32 s2, -1
1832; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1833; GFX7-NEXT:    s_and_b64 s[8:9], s[4:5], s[0:1]
1834; GFX7-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1835; GFX7-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1836; GFX7-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1837; GFX7-NEXT:    s_add_u32 s0, s0, 10
1838; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1839; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1840; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1841; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1842; GFX7-NEXT:    s_endpgm
1843;
1844; GFX8-LABEL: s_bfi_sha256_ma_i64:
1845; GFX8:       ; %bb.0: ; %entry
1846; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1847; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1848; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1849; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[0:1]
1850; GFX8-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1851; GFX8-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1852; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1853; GFX8-NEXT:    s_add_u32 s0, s0, 10
1854; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1855; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1856; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1857; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1858; GFX8-NEXT:    s_endpgm
1859;
1860; GFX10-LABEL: s_bfi_sha256_ma_i64:
1861; GFX10:       ; %bb.0: ; %entry
1862; GFX10-NEXT:    s_clause 0x1
1863; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1864; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1865; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1866; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[0:1]
1867; GFX10-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
1868; GFX10-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
1869; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1870; GFX10-NEXT:    s_add_u32 s0, s0, 10
1871; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1872; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1873; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1874; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1875; GFX10-NEXT:    s_endpgm
1876;
1877; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
1878; GFX8-GISEL:       ; %bb.0: ; %entry
1879; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1880; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1881; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1882; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[4:5], s[0:1]
1883; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1884; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1885; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1886; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1887; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1888; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1889; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1890; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1891; GFX8-GISEL-NEXT:    s_endpgm
1892;
1893; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64:
1894; GFX10-GISEL:       ; %bb.0: ; %entry
1895; GFX10-GISEL-NEXT:    s_clause 0x1
1896; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1897; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1898; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1899; GFX10-GISEL-NEXT:    s_or_b64 s[2:3], s[4:5], s[0:1]
1900; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
1901; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
1902; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1903; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1904; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1905; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1906; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1907; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1908; GFX10-GISEL-NEXT:    s_endpgm
1909entry:
  ; A result bit is set when both %x and %z are set, or when %y is set together
  ; with at least one of %x and %z.
1910  %and0 = and i64 %x, %z
1911  %or0 = or i64 %x, %z
1912  %and1 = and i64 %y, %or0
1913  %or1 = or i64 %and0, %and1
1914
  ; The add consumes the result and is checked as an s_add_u32 / s_addc_u32
  ; pair. Presumably this scalar user is what keeps the expression off the
  ; VALU - TODO confirm.
1915  %scalar.use = add i64 %or1, 10
  ; The pointer operand is undef, so the address registers in the checked
  ; store instructions carry no meaningful value.
1916  store i64 %scalar.use, i64 addrspace(1)* undef
1917  ret void
1918}
1919