xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bfi_int.ll (revision 09fbdde42cc1aa09fbfc11c0b2d2be8a28cc91db)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
5; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
6; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
7
8; BFI_INT Definition pattern from ISA docs
9; (y & x) | (z & ~x)
10;
; Kernel-argument variant of the definition pattern: computes
; (z & ~x) | (y & x) and stores the 32-bit result to %out.
; All five check prefixes expect the select to be done with scalar
; instructions (s_andn2_b32, s_and_b32, s_or_b32), with the result copied to a
; VGPR only for the store; no v_bfi_b32 is expected in this function.
11define amdgpu_kernel void @s_bfi_def_i32(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
12; GFX7-LABEL: s_bfi_def_i32:
13; GFX7:       ; %bb.0: ; %entry
14; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
15; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
16; GFX7-NEXT:    s_mov_b32 s3, 0xf000
17; GFX7-NEXT:    s_mov_b32 s2, -1
18; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b32 s0, s4
20; GFX7-NEXT:    s_mov_b32 s1, s5
21; GFX7-NEXT:    s_andn2_b32 s4, s8, s6
22; GFX7-NEXT:    s_and_b32 s5, s7, s6
23; GFX7-NEXT:    s_or_b32 s4, s4, s5
24; GFX7-NEXT:    v_mov_b32_e32 v0, s4
25; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
26; GFX7-NEXT:    s_endpgm
27;
28; GFX8-LABEL: s_bfi_def_i32:
29; GFX8:       ; %bb.0: ; %entry
30; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
31; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
32; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX8-NEXT:    s_and_b32 s1, s7, s6
34; GFX8-NEXT:    s_andn2_b32 s0, s0, s6
35; GFX8-NEXT:    s_or_b32 s0, s0, s1
36; GFX8-NEXT:    v_mov_b32_e32 v0, s4
37; GFX8-NEXT:    v_mov_b32_e32 v1, s5
38; GFX8-NEXT:    v_mov_b32_e32 v2, s0
39; GFX8-NEXT:    flat_store_dword v[0:1], v2
40; GFX8-NEXT:    s_endpgm
41;
42; GFX10-LABEL: s_bfi_def_i32:
43; GFX10:       ; %bb.0: ; %entry
44; GFX10-NEXT:    s_clause 0x1
45; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
46; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
47; GFX10-NEXT:    v_mov_b32_e32 v0, 0
48; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX10-NEXT:    s_and_b32 s1, s7, s6
50; GFX10-NEXT:    s_andn2_b32 s0, s0, s6
51; GFX10-NEXT:    s_or_b32 s0, s0, s1
52; GFX10-NEXT:    v_mov_b32_e32 v1, s0
53; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
54; GFX10-NEXT:    s_endpgm
55;
56; GFX8-GISEL-LABEL: s_bfi_def_i32:
57; GFX8-GISEL:       ; %bb.0: ; %entry
58; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
59; GFX8-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x34
60; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX8-GISEL-NEXT:    s_and_b32 s1, s7, s6
62; GFX8-GISEL-NEXT:    s_andn2_b32 s0, s0, s6
63; GFX8-GISEL-NEXT:    s_or_b32 s0, s0, s1
64; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s4
65; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
66; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s5
67; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
68; GFX8-GISEL-NEXT:    s_endpgm
69;
70; GFX10-GISEL-LABEL: s_bfi_def_i32:
71; GFX10-GISEL:       ; %bb.0: ; %entry
72; GFX10-GISEL-NEXT:    s_clause 0x1
73; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
74; GFX10-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x34
75; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
76; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX10-GISEL-NEXT:    s_and_b32 s1, s7, s6
78; GFX10-GISEL-NEXT:    s_andn2_b32 s0, s0, s6
79; GFX10-GISEL-NEXT:    s_or_b32 s0, s0, s1
80; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
81; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
82; GFX10-GISEL-NEXT:    s_endpgm
83entry:
  ; %0 = ~x, %1 = z & ~x, %2 = y & x, %3 = (z & ~x) | (y & x)
84  %0 = xor i32 %x, -1
85  %1 = and i32 %z, %0
86  %2 = and i32 %y, %x
87  %3 = or i32 %1, %2
88  store i32 %3, i32 addrspace(1)* %out
89  ret void
90}
91
; Callable-function variant of the definition pattern: returns
; (z & ~x) | (y & x).
; The checks show the three arguments in v0, v1 and v2, and every check
; prefix expects the whole expression to become a single
; v_bfi_b32 v0, v0, v1, v2.
92define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
93; GFX7-LABEL: v_bfi_def_i32:
94; GFX7:       ; %bb.0: ; %entry
95; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
97; GFX7-NEXT:    s_setpc_b64 s[30:31]
98;
99; GFX8-LABEL: v_bfi_def_i32:
100; GFX8:       ; %bb.0: ; %entry
101; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
103; GFX8-NEXT:    s_setpc_b64 s[30:31]
104;
105; GFX10-LABEL: v_bfi_def_i32:
106; GFX10:       ; %bb.0: ; %entry
107; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
109; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
110; GFX10-NEXT:    s_setpc_b64 s[30:31]
111;
112; GFX8-GISEL-LABEL: v_bfi_def_i32:
113; GFX8-GISEL:       ; %bb.0: ; %entry
114; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
116; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
117;
118; GFX10-GISEL-LABEL: v_bfi_def_i32:
119; GFX10-GISEL:       ; %bb.0: ; %entry
120; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
123; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
124entry:
  ; %0 = ~x, %1 = z & ~x, %2 = y & x, %3 = (z & ~x) | (y & x)
125  %0 = xor i32 %x, -1
126  %1 = and i32 %z, %0
127  %2 = and i32 %y, %x
128  %3 = or i32 %1, %2
129  ret i32 %3
130}
131
132; SHA-256 Ch function
133; z ^ (x & (y ^ z))
; Kernel-argument variant: computes z ^ (x & (y ^ z)) and stores the 32-bit
; result to %out.
; All five check prefixes expect the xor/and/xor sequence to be emitted as
; scalar instructions (s_xor_b32, s_and_b32, s_xor_b32); no v_bfi_b32 is
; expected in this function.
134define amdgpu_kernel void @s_bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
135; GFX7-LABEL: s_bfi_sha256_ch:
136; GFX7:       ; %bb.0: ; %entry
137; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
138; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
139; GFX7-NEXT:    s_mov_b32 s3, 0xf000
140; GFX7-NEXT:    s_mov_b32 s2, -1
141; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX7-NEXT:    s_mov_b32 s0, s4
143; GFX7-NEXT:    s_xor_b32 s4, s7, s8
144; GFX7-NEXT:    s_and_b32 s4, s6, s4
145; GFX7-NEXT:    s_xor_b32 s4, s8, s4
146; GFX7-NEXT:    s_mov_b32 s1, s5
147; GFX7-NEXT:    v_mov_b32_e32 v0, s4
148; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
149; GFX7-NEXT:    s_endpgm
150;
151; GFX8-LABEL: s_bfi_sha256_ch:
152; GFX8:       ; %bb.0: ; %entry
153; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
154; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
155; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX8-NEXT:    v_mov_b32_e32 v0, s4
157; GFX8-NEXT:    s_xor_b32 s1, s7, s0
158; GFX8-NEXT:    s_and_b32 s1, s6, s1
159; GFX8-NEXT:    s_xor_b32 s0, s0, s1
160; GFX8-NEXT:    v_mov_b32_e32 v1, s5
161; GFX8-NEXT:    v_mov_b32_e32 v2, s0
162; GFX8-NEXT:    flat_store_dword v[0:1], v2
163; GFX8-NEXT:    s_endpgm
164;
165; GFX10-LABEL: s_bfi_sha256_ch:
166; GFX10:       ; %bb.0: ; %entry
167; GFX10-NEXT:    s_clause 0x1
168; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
169; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
170; GFX10-NEXT:    v_mov_b32_e32 v0, 0
171; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX10-NEXT:    s_xor_b32 s1, s7, s0
173; GFX10-NEXT:    s_and_b32 s1, s6, s1
174; GFX10-NEXT:    s_xor_b32 s0, s0, s1
175; GFX10-NEXT:    v_mov_b32_e32 v1, s0
176; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
177; GFX10-NEXT:    s_endpgm
178;
179; GFX8-GISEL-LABEL: s_bfi_sha256_ch:
180; GFX8-GISEL:       ; %bb.0: ; %entry
181; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
182; GFX8-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x34
183; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s4
185; GFX8-GISEL-NEXT:    s_xor_b32 s1, s7, s0
186; GFX8-GISEL-NEXT:    s_and_b32 s1, s6, s1
187; GFX8-GISEL-NEXT:    s_xor_b32 s0, s0, s1
188; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
189; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s5
190; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
191; GFX8-GISEL-NEXT:    s_endpgm
192;
193; GFX10-GISEL-LABEL: s_bfi_sha256_ch:
194; GFX10-GISEL:       ; %bb.0: ; %entry
195; GFX10-GISEL-NEXT:    s_clause 0x1
196; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
197; GFX10-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x34
198; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
199; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX10-GISEL-NEXT:    s_xor_b32 s1, s7, s0
201; GFX10-GISEL-NEXT:    s_and_b32 s1, s6, s1
202; GFX10-GISEL-NEXT:    s_xor_b32 s0, s0, s1
203; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
204; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
205; GFX10-GISEL-NEXT:    s_endpgm
206entry:
  ; %0 = y ^ z, %1 = x & (y ^ z), %2 = z ^ (x & (y ^ z))
207  %0 = xor i32 %y, %z
208  %1 = and i32 %x, %0
209  %2 = xor i32 %z, %1
210  store i32 %2, i32 addrspace(1)* %out
211  ret void
212}
213
; Callable-function variant of z ^ (x & (y ^ z)) with the result returned.
; The checks show the three arguments in v0, v1 and v2, and every check
; prefix expects the xor/and/xor chain to become a single
; v_bfi_b32 v0, v0, v1, v2.
214define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
215; GFX7-LABEL: v_bfi_sha256_ch:
216; GFX7:       ; %bb.0: ; %entry
217; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
219; GFX7-NEXT:    s_setpc_b64 s[30:31]
220;
221; GFX8-LABEL: v_bfi_sha256_ch:
222; GFX8:       ; %bb.0: ; %entry
223; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
225; GFX8-NEXT:    s_setpc_b64 s[30:31]
226;
227; GFX10-LABEL: v_bfi_sha256_ch:
228; GFX10:       ; %bb.0: ; %entry
229; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
231; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
232; GFX10-NEXT:    s_setpc_b64 s[30:31]
233;
234; GFX8-GISEL-LABEL: v_bfi_sha256_ch:
235; GFX8-GISEL:       ; %bb.0: ; %entry
236; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
238; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX10-GISEL-LABEL: v_bfi_sha256_ch:
241; GFX10-GISEL:       ; %bb.0: ; %entry
242; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
244; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
245; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
246entry:
  ; %0 = y ^ z, %1 = x & (y ^ z), %2 = z ^ (x & (y ^ z))
247  %0 = xor i32 %y, %z
248  %1 = and i32 %x, %0
249  %2 = xor i32 %z, %1
250  ret i32 %2
251}
252
; z ^ (x & (y ^ z)) as an amdgpu_ps function with %x as a plain argument and
; %y, %z marked inreg; the checks show %x in v0 and %y, %z in s0, s1.
; Expected code per check prefix:
;  - GFX7 and GFX8 copy s1 into a VGPR with v_mov_b32 and then use one
;    v_bfi_b32 with a single SGPR operand.
;  - GFX10 uses one v_bfi_b32 that reads both s0 and s1 directly.
;  - Both -global-isel prefixes expect no v_bfi_b32; they keep the
;    s_xor_b32 / v_and_b32 / v_xor_b32 sequence.
; The i32 result is bitcast to float for the return value.
253define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
254; GFX7-LABEL: v_s_s_bfi_sha256_ch:
255; GFX7:       ; %bb.0: ; %entry
256; GFX7-NEXT:    v_mov_b32_e32 v1, s1
257; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
258; GFX7-NEXT:    ; return to shader part epilog
259;
260; GFX8-LABEL: v_s_s_bfi_sha256_ch:
261; GFX8:       ; %bb.0: ; %entry
262; GFX8-NEXT:    v_mov_b32_e32 v1, s1
263; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
264; GFX8-NEXT:    ; return to shader part epilog
265;
266; GFX10-LABEL: v_s_s_bfi_sha256_ch:
267; GFX10:       ; %bb.0: ; %entry
268; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s1
269; GFX10-NEXT:    ; return to shader part epilog
270;
271; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ch:
272; GFX8-GISEL:       ; %bb.0: ; %entry
273; GFX8-GISEL-NEXT:    s_xor_b32 s0, s0, s1
274; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
275; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
276; GFX8-GISEL-NEXT:    ; return to shader part epilog
277;
278; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ch:
279; GFX10-GISEL:       ; %bb.0: ; %entry
280; GFX10-GISEL-NEXT:    s_xor_b32 s0, s0, s1
281; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
282; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
283; GFX10-GISEL-NEXT:    ; return to shader part epilog
284entry:
285  %xor0 = xor i32 %y, %z
286  %and = and i32 %x, %xor0
287  %xor1 = xor i32 %z, %and
288  %cast = bitcast i32 %xor1 to float
289  ret float %cast
290}
291
; z ^ (x & (y ^ z)) as an amdgpu_ps function with %x and %z marked inreg and
; %y as a plain argument; the checks show %x, %z in s0, s1 and %y in v0.
; Expected code per check prefix:
;  - GFX7 and GFX8 copy s1 into a VGPR with v_mov_b32 and then use one
;    v_bfi_b32 with a single SGPR operand.
;  - GFX10 uses one v_bfi_b32 that reads both s0 and s1 directly.
;  - Both -global-isel prefixes expect no v_bfi_b32; they keep the
;    v_xor_b32 / v_and_b32 / v_xor_b32 sequence.
; The i32 result is bitcast to float for the return value.
292define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
293; GFX7-LABEL: s_v_s_bfi_sha256_ch:
294; GFX7:       ; %bb.0: ; %entry
295; GFX7-NEXT:    v_mov_b32_e32 v1, s1
296; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
297; GFX7-NEXT:    ; return to shader part epilog
298;
299; GFX8-LABEL: s_v_s_bfi_sha256_ch:
300; GFX8:       ; %bb.0: ; %entry
301; GFX8-NEXT:    v_mov_b32_e32 v1, s1
302; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
303; GFX8-NEXT:    ; return to shader part epilog
304;
305; GFX10-LABEL: s_v_s_bfi_sha256_ch:
306; GFX10:       ; %bb.0: ; %entry
307; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s1
308; GFX10-NEXT:    ; return to shader part epilog
309;
310; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ch:
311; GFX8-GISEL:       ; %bb.0: ; %entry
312; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
313; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
314; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
315; GFX8-GISEL-NEXT:    ; return to shader part epilog
316;
317; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ch:
318; GFX10-GISEL:       ; %bb.0: ; %entry
319; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
320; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
321; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s1, v0
322; GFX10-GISEL-NEXT:    ; return to shader part epilog
323entry:
324  %xor0 = xor i32 %y, %z
325  %and = and i32 %x, %xor0
326  %xor1 = xor i32 %z, %and
327  %cast = bitcast i32 %xor1 to float
328  ret float %cast
329}
330
; z ^ (x & (y ^ z)) as an amdgpu_ps function with %x and %y marked inreg and
; %z as a plain argument; the checks show %x, %y in s0, s1 and %z in v0.
; All five check prefixes expect a v_bfi_b32 here:
;  - GFX7 and GFX8 first copy s1 into a VGPR with v_mov_b32.
;  - The GFX8 -global-isel prefix first copies s0 into a VGPR instead.
;  - GFX10 (with and without -global-isel) reads both s0 and s1 directly.
; The i32 result is bitcast to float for the return value.
331define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
332; GFX7-LABEL: s_s_v_bfi_sha256_ch:
333; GFX7:       ; %bb.0: ; %entry
334; GFX7-NEXT:    v_mov_b32_e32 v1, s1
335; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
336; GFX7-NEXT:    ; return to shader part epilog
337;
338; GFX8-LABEL: s_s_v_bfi_sha256_ch:
339; GFX8:       ; %bb.0: ; %entry
340; GFX8-NEXT:    v_mov_b32_e32 v1, s1
341; GFX8-NEXT:    v_bfi_b32 v0, s0, v1, v0
342; GFX8-NEXT:    ; return to shader part epilog
343;
344; GFX10-LABEL: s_s_v_bfi_sha256_ch:
345; GFX10:       ; %bb.0: ; %entry
346; GFX10-NEXT:    v_bfi_b32 v0, s0, s1, v0
347; GFX10-NEXT:    ; return to shader part epilog
348;
349; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
350; GFX8-GISEL:       ; %bb.0: ; %entry
351; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s0
352; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v1, s1, v0
353; GFX8-GISEL-NEXT:    ; return to shader part epilog
354;
355; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ch:
356; GFX10-GISEL:       ; %bb.0: ; %entry
357; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, s1, v0
358; GFX10-GISEL-NEXT:    ; return to shader part epilog
359entry:
360  %xor0 = xor i32 %y, %z
361  %and = and i32 %x, %xor0
362  %xor1 = xor i32 %z, %and
363  %cast = bitcast i32 %xor1 to float
364  ret float %cast
365}
366
; z ^ (x & (y ^ z)) as an amdgpu_ps function with only %x marked inreg; the
; checks show %x in s0 and %y, %z in v0, v1.
; Every check prefix expects a single v_bfi_b32 v0, s0, v0, v1 with no extra
; copy. The i32 result is bitcast to float for the return value.
367define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
368; GFX7-LABEL: s_v_v_bfi_sha256_ch:
369; GFX7:       ; %bb.0: ; %entry
370; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
371; GFX7-NEXT:    ; return to shader part epilog
372;
373; GFX8-LABEL: s_v_v_bfi_sha256_ch:
374; GFX8:       ; %bb.0: ; %entry
375; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
376; GFX8-NEXT:    ; return to shader part epilog
377;
378; GFX10-LABEL: s_v_v_bfi_sha256_ch:
379; GFX10:       ; %bb.0: ; %entry
380; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v1
381; GFX10-NEXT:    ; return to shader part epilog
382;
383; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
384; GFX8-GISEL:       ; %bb.0: ; %entry
385; GFX8-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
386; GFX8-GISEL-NEXT:    ; return to shader part epilog
387;
388; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
389; GFX10-GISEL:       ; %bb.0: ; %entry
390; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
391; GFX10-GISEL-NEXT:    ; return to shader part epilog
392entry:
393  %xor0 = xor i32 %y, %z
394  %and = and i32 %x, %xor0
395  %xor1 = xor i32 %z, %and
396  %cast = bitcast i32 %xor1 to float
397  ret float %cast
398}
399
; z ^ (x & (y ^ z)) as an amdgpu_ps function with only %y marked inreg; the
; checks show %y in s0 and %x, %z in v0, v1.
; Every check prefix expects a single v_bfi_b32 v0, v0, s0, v1 with no extra
; copy. The i32 result is bitcast to float for the return value.
400define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
401; GFX7-LABEL: v_s_v_bfi_sha256_ch:
402; GFX7:       ; %bb.0: ; %entry
403; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
404; GFX7-NEXT:    ; return to shader part epilog
405;
406; GFX8-LABEL: v_s_v_bfi_sha256_ch:
407; GFX8:       ; %bb.0: ; %entry
408; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
409; GFX8-NEXT:    ; return to shader part epilog
410;
411; GFX10-LABEL: v_s_v_bfi_sha256_ch:
412; GFX10:       ; %bb.0: ; %entry
413; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v1
414; GFX10-NEXT:    ; return to shader part epilog
415;
416; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ch:
417; GFX8-GISEL:       ; %bb.0: ; %entry
418; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v1
419; GFX8-GISEL-NEXT:    ; return to shader part epilog
420;
421; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ch:
422; GFX10-GISEL:       ; %bb.0: ; %entry
423; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v1
424; GFX10-GISEL-NEXT:    ; return to shader part epilog
425entry:
426  %xor0 = xor i32 %y, %z
427  %and = and i32 %x, %xor0
428  %xor1 = xor i32 %z, %and
429  %cast = bitcast i32 %xor1 to float
430  ret float %cast
431}
432
; z ^ (x & (y ^ z)) as an amdgpu_ps function with only %z marked inreg; the
; checks show %z in s0 and %x, %y in v0, v1.
; Expected code per check prefix:
;  - GFX7, GFX8 and GFX10 use a single v_bfi_b32 v0, v0, v1, s0.
;  - Both -global-isel prefixes expect no v_bfi_b32; they keep the
;    v_xor_b32 / v_and_b32 / v_xor_b32 sequence.
; The i32 result is bitcast to float for the return value.
433define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
434; GFX7-LABEL: v_v_s_bfi_sha256_ch:
435; GFX7:       ; %bb.0: ; %entry
436; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, s0
437; GFX7-NEXT:    ; return to shader part epilog
438;
439; GFX8-LABEL: v_v_s_bfi_sha256_ch:
440; GFX8:       ; %bb.0: ; %entry
441; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, s0
442; GFX8-NEXT:    ; return to shader part epilog
443;
444; GFX10-LABEL: v_v_s_bfi_sha256_ch:
445; GFX10:       ; %bb.0: ; %entry
446; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, s0
447; GFX10-NEXT:    ; return to shader part epilog
448;
449; GFX8-GISEL-LABEL: v_v_s_bfi_sha256_ch:
450; GFX8-GISEL:       ; %bb.0: ; %entry
451; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
452; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
453; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
454; GFX8-GISEL-NEXT:    ; return to shader part epilog
455;
456; GFX10-GISEL-LABEL: v_v_s_bfi_sha256_ch:
457; GFX10-GISEL:       ; %bb.0: ; %entry
458; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s0, v1
459; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
460; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
461; GFX10-GISEL-NEXT:    ; return to shader part epilog
462entry:
463  %xor0 = xor i32 %y, %z
464  %and = and i32 %x, %xor0
465  %xor1 = xor i32 %z, %and
466  %cast = bitcast i32 %xor1 to float
467  ret float %cast
468}
469
470; SHA-256 Ma function
471; ((x & z) | (y & (x | z)))
; Kernel-argument variant: computes (x & z) | (y & (x | z)) and stores the
; 32-bit result to %out.
; All five check prefixes expect the expression to be emitted as scalar
; instructions (two s_and_b32 and two s_or_b32); no v_bfi_b32 is expected in
; this function.
472define amdgpu_kernel void @s_bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
473; GFX7-LABEL: s_bfi_sha256_ma:
474; GFX7:       ; %bb.0: ; %entry
475; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
476; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
477; GFX7-NEXT:    s_mov_b32 s3, 0xf000
478; GFX7-NEXT:    s_mov_b32 s2, -1
479; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX7-NEXT:    s_mov_b32 s1, s5
481; GFX7-NEXT:    s_or_b32 s5, s6, s8
482; GFX7-NEXT:    s_mov_b32 s0, s4
483; GFX7-NEXT:    s_and_b32 s4, s6, s8
484; GFX7-NEXT:    s_and_b32 s5, s7, s5
485; GFX7-NEXT:    s_or_b32 s4, s4, s5
486; GFX7-NEXT:    v_mov_b32_e32 v0, s4
487; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
488; GFX7-NEXT:    s_endpgm
489;
490; GFX8-LABEL: s_bfi_sha256_ma:
491; GFX8:       ; %bb.0: ; %entry
492; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
493; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
494; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
495; GFX8-NEXT:    v_mov_b32_e32 v0, s4
496; GFX8-NEXT:    s_and_b32 s1, s6, s0
497; GFX8-NEXT:    s_or_b32 s0, s6, s0
498; GFX8-NEXT:    s_and_b32 s0, s7, s0
499; GFX8-NEXT:    s_or_b32 s0, s1, s0
500; GFX8-NEXT:    v_mov_b32_e32 v1, s5
501; GFX8-NEXT:    v_mov_b32_e32 v2, s0
502; GFX8-NEXT:    flat_store_dword v[0:1], v2
503; GFX8-NEXT:    s_endpgm
504;
505; GFX10-LABEL: s_bfi_sha256_ma:
506; GFX10:       ; %bb.0: ; %entry
507; GFX10-NEXT:    s_clause 0x1
508; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
509; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
510; GFX10-NEXT:    v_mov_b32_e32 v0, 0
511; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX10-NEXT:    s_or_b32 s1, s6, s0
513; GFX10-NEXT:    s_and_b32 s0, s6, s0
514; GFX10-NEXT:    s_and_b32 s1, s7, s1
515; GFX10-NEXT:    s_or_b32 s0, s0, s1
516; GFX10-NEXT:    v_mov_b32_e32 v1, s0
517; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
518; GFX10-NEXT:    s_endpgm
519;
520; GFX8-GISEL-LABEL: s_bfi_sha256_ma:
521; GFX8-GISEL:       ; %bb.0: ; %entry
522; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
523; GFX8-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x34
524; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s4
526; GFX8-GISEL-NEXT:    s_and_b32 s1, s6, s0
527; GFX8-GISEL-NEXT:    s_or_b32 s0, s6, s0
528; GFX8-GISEL-NEXT:    s_and_b32 s0, s7, s0
529; GFX8-GISEL-NEXT:    s_or_b32 s0, s1, s0
530; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
531; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s5
532; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
533; GFX8-GISEL-NEXT:    s_endpgm
534;
535; GFX10-GISEL-LABEL: s_bfi_sha256_ma:
536; GFX10-GISEL:       ; %bb.0: ; %entry
537; GFX10-GISEL-NEXT:    s_clause 0x1
538; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
539; GFX10-GISEL-NEXT:    s_load_dword s0, s[0:1], 0x34
540; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
541; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX10-GISEL-NEXT:    s_or_b32 s1, s6, s0
543; GFX10-GISEL-NEXT:    s_and_b32 s0, s6, s0
544; GFX10-GISEL-NEXT:    s_and_b32 s1, s7, s1
545; GFX10-GISEL-NEXT:    s_or_b32 s0, s0, s1
546; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
547; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[4:5]
548; GFX10-GISEL-NEXT:    s_endpgm
549entry:
  ; %0 = x & z, %1 = x | z, %2 = y & (x | z), %3 = (x & z) | (y & (x | z))
550  %0 = and i32 %x, %z
551  %1 = or i32 %x, %z
552  %2 = and i32 %y, %1
553  %3 = or i32 %0, %2
554  store i32 %3, i32 addrspace(1)* %out
555  ret void
556}
557
; Callable-function variant of (x & z) | (y & (x | z)) with the result
; returned; the checks show the three arguments in v0, v1 and v2.
; Every check prefix expects a two-instruction sequence:
; v_xor_b32 v0, v0, v1 followed by v_bfi_b32 v0, v0, v2, v1, i.e. the xor of
; the first two arguments is used as the first v_bfi_b32 source operand.
558define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
559; GFX7-LABEL: v_bfi_sha256_ma:
560; GFX7:       ; %bb.0: ; %entry
561; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
563; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v1
564; GFX7-NEXT:    s_setpc_b64 s[30:31]
565;
566; GFX8-LABEL: v_bfi_sha256_ma:
567; GFX8:       ; %bb.0: ; %entry
568; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
570; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v1
571; GFX8-NEXT:    s_setpc_b64 s[30:31]
572;
573; GFX10-LABEL: v_bfi_sha256_ma:
574; GFX10:       ; %bb.0: ; %entry
575; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
577; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
578; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v1
579; GFX10-NEXT:    s_setpc_b64 s[30:31]
580;
581; GFX8-GISEL-LABEL: v_bfi_sha256_ma:
582; GFX8-GISEL:       ; %bb.0: ; %entry
583; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
584; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
585; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
586; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
587;
588; GFX10-GISEL-LABEL: v_bfi_sha256_ma:
589; GFX10-GISEL:       ; %bb.0: ; %entry
590; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
591; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
592; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
593; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
594; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
595entry:
  ; %0 = x & z, %1 = x | z, %2 = y & (x | z), %3 = (x & z) | (y & (x | z))
596  %0 = and i32 %x, %z
597  %1 = or i32 %x, %z
598  %2 = and i32 %y, %1
599  %3 = or i32 %0, %2
600  ret i32 %3
601}
602
; Vector bitselect: returns ((a ^ mask) & b) ^ mask on <2 x i32>.
; The checks show %a in v0-v1, %b in v2-v3 and %mask in v4-v5.
; Expected code per check prefix:
;  - GFX7, GFX8 and GFX10 use one v_bfi_b32 per element, with the %b element
;    as the first source operand.
;  - Both -global-isel prefixes expect no v_bfi_b32; they keep a
;    v_xor_b32 / v_and_b32 / v_xor_b32 sequence per element.
603define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
604; GFX7-LABEL: v_bitselect_v2i32_pat1:
605; GFX7:       ; %bb.0:
606; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
607; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
608; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
609; GFX7-NEXT:    s_setpc_b64 s[30:31]
610;
611; GFX8-LABEL: v_bitselect_v2i32_pat1:
612; GFX8:       ; %bb.0:
613; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
614; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
615; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
616; GFX8-NEXT:    s_setpc_b64 s[30:31]
617;
618; GFX10-LABEL: v_bitselect_v2i32_pat1:
619; GFX10:       ; %bb.0:
620; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
621; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
622; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
623; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
624; GFX10-NEXT:    s_setpc_b64 s[30:31]
625;
626; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
627; GFX8-GISEL:       ; %bb.0:
628; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
629; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
630; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
631; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
632; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
633; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
634; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
635; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
636;
637; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
638; GFX10-GISEL:       ; %bb.0:
639; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
640; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
641; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
642; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
643; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
644; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
645; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
646; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
647; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
648  %xor.0 = xor <2 x i32> %a, %mask
649  %and = and <2 x i32> %xor.0, %b
650  %bitselect = xor <2 x i32> %and, %mask
651  ret <2 x i32> %bitselect
652}
653
; 64-bit bitselect: returns (a & b) | (~a & mask).
; The checks show %a in v0-v1, %b in v2-v3 and %mask in v4-v5.
; Expected code per check prefix:
;  - GFX7, GFX8 and GFX10 split the operation into two v_bfi_b32, one per
;    32-bit half (the two halves are emitted in a different order on GFX10).
;  - Both -global-isel prefixes expect no v_bfi_b32; each half is computed
;    with v_and_b32, v_xor_b32 with -1, v_and_b32 and v_or_b32.
654define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
655; GFX7-LABEL: v_bitselect_i64_pat_0:
656; GFX7:       ; %bb.0:
657; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, v5
659; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v4
660; GFX7-NEXT:    s_setpc_b64 s[30:31]
661;
662; GFX8-LABEL: v_bitselect_i64_pat_0:
663; GFX8:       ; %bb.0:
664; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, v5
666; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v4
667; GFX8-NEXT:    s_setpc_b64 s[30:31]
668;
669; GFX10-LABEL: v_bitselect_i64_pat_0:
670; GFX10:       ; %bb.0:
671; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
673; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v4
674; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, v5
675; GFX10-NEXT:    s_setpc_b64 s[30:31]
676;
677; GFX8-GISEL-LABEL: v_bitselect_i64_pat_0:
678; GFX8-GISEL:       ; %bb.0:
679; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v0, v2
681; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v1, v3
682; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
683; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
684; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
685; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
686; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
687; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
688; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
689;
690; GFX10-GISEL-LABEL: v_bitselect_i64_pat_0:
691; GFX10-GISEL:       ; %bb.0:
692; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
693; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
694; GFX10-GISEL-NEXT:    v_xor_b32_e32 v6, -1, v0
695; GFX10-GISEL-NEXT:    v_xor_b32_e32 v7, -1, v1
696; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
697; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
698; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v6, v4
699; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v7, v5
700; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
701; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
702; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
703  %and0 = and i64 %a, %b
704  %not.a = xor i64 %a, -1
705  %and1 = and i64 %not.a, %mask
706  %bitselect = or i64 %and0, %and1
707  ret i64 %bitselect
708}
709
; 64-bit bitselect (a & b) | (~a & mask) as an amdgpu_ps function with %a as
; a plain argument and %b, %mask marked inreg; the checks show %a in v0-v1,
; %b in s0-s1 and %mask in s2-s3.
; Expected code per check prefix:
;  - GFX7 and GFX8 use two v_bfi_b32, each preceded by a v_mov_b32 that
;    copies the matching mask half (s3, then s2) into a VGPR.
;  - GFX10 uses two v_bfi_b32 that each read two SGPRs directly.
;  - Both -global-isel prefixes expect no v_bfi_b32; each half is computed
;    with v_and_b32, v_xor_b32 with -1, v_and_b32 and v_or_b32.
; The i64 result is bitcast to <2 x float> for the return value.
710define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
711; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
712; GFX7:       ; %bb.0:
713; GFX7-NEXT:    v_mov_b32_e32 v2, s3
714; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
715; GFX7-NEXT:    v_mov_b32_e32 v2, s2
716; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
717; GFX7-NEXT:    ; return to shader part epilog
718;
719; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
720; GFX8:       ; %bb.0:
721; GFX8-NEXT:    v_mov_b32_e32 v2, s3
722; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
723; GFX8-NEXT:    v_mov_b32_e32 v2, s2
724; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
725; GFX8-NEXT:    ; return to shader part epilog
726;
727; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
728; GFX10:       ; %bb.0:
729; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
730; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
731; GFX10-NEXT:    ; return to shader part epilog
732;
733; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
734; GFX8-GISEL:       ; %bb.0:
735; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v0
736; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v1
737; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
738; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
739; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
740; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s3, v1
741; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
742; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
743; GFX8-GISEL-NEXT:    ; return to shader part epilog
744;
745; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
746; GFX10-GISEL:       ; %bb.0:
747; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, -1, v0
748; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, -1, v1
749; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
750; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
751; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
752; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
753; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
754; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
755; GFX10-GISEL-NEXT:    ; return to shader part epilog
756  %and0 = and i64 %a, %b
757  %not.a = xor i64 %a, -1
758  %and1 = and i64 %not.a, %mask
759  %bitselect = or i64 %and0, %and1
760  %cast = bitcast i64 %bitselect to <2 x float>
761  ret <2 x float> %cast
762}
763
; 64-bit bitselect (a & b) | (~a & mask) as an amdgpu_ps function with %a
; and %mask marked inreg and %b as a plain argument; the checks show %a in
; s0-s1, %b in v0-v1 and %mask in s2-s3.
; Expected code per check prefix:
;  - GFX7 and GFX8 use two v_bfi_b32, each preceded by a v_mov_b32 that
;    copies the matching mask half (s3, then s2) into a VGPR.
;  - GFX10 uses two v_bfi_b32 that each read two SGPRs directly.
;  - Both -global-isel prefixes expect no v_bfi_b32; they compute a & b with
;    two v_and_b32, mask & ~a with one s_andn2_b64, and combine the halves
;    with two v_or_b32.
; The i64 result is bitcast to <2 x float> for the return value.
764define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
765; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
766; GFX7:       ; %bb.0:
767; GFX7-NEXT:    v_mov_b32_e32 v2, s3
768; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
769; GFX7-NEXT:    v_mov_b32_e32 v2, s2
770; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
771; GFX7-NEXT:    ; return to shader part epilog
772;
773; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
774; GFX8:       ; %bb.0:
775; GFX8-NEXT:    v_mov_b32_e32 v2, s3
776; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
777; GFX8-NEXT:    v_mov_b32_e32 v2, s2
778; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
779; GFX8-NEXT:    ; return to shader part epilog
780;
781; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
782; GFX10:       ; %bb.0:
783; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
784; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
785; GFX10-NEXT:    ; return to shader part epilog
786;
787; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
788; GFX8-GISEL:       ; %bb.0:
789; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
790; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
791; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
792; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
793; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
794; GFX8-GISEL-NEXT:    ; return to shader part epilog
795;
796; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
797; GFX10-GISEL:       ; %bb.0:
798; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
799; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
800; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
801; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
802; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
803; GFX10-GISEL-NEXT:    ; return to shader part epilog
804  %and0 = and i64 %a, %b
805  %not.a = xor i64 %a, -1
806  %and1 = and i64 %not.a, %mask
807  %bitselect = or i64 %and0, %and1
808  %cast = bitcast i64 %bitselect to <2 x float>
809  ret <2 x float> %cast
810}
811
; 64-bit bitselect (a & b) | (~a & mask) as an amdgpu_ps function with %a
; and %b marked inreg and %mask as a plain argument; the checks show %a in
; s0-s1, %b in s2-s3 and %mask in v0-v1.
; Expected code per check prefix:
;  - GFX7 and GFX8 use two v_bfi_b32, each preceded by a v_mov_b32 that
;    copies the matching half of %b (s3, then s2) into a VGPR.
;  - GFX10 uses two v_bfi_b32 that each read two SGPRs directly.
;  - Both -global-isel prefixes expect no v_bfi_b32; they compute a & b with
;    s_and_b64 and ~a with s_not_b64, then finish each half with v_and_b32
;    and v_or_b32 (the two scalar instructions appear in opposite order for
;    the two prefixes).
; The i64 result is bitcast to <2 x float> for the return value.
812define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
813; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
814; GFX7:       ; %bb.0:
815; GFX7-NEXT:    v_mov_b32_e32 v2, s3
816; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
817; GFX7-NEXT:    v_mov_b32_e32 v2, s2
818; GFX7-NEXT:    v_bfi_b32 v0, s0, v2, v0
819; GFX7-NEXT:    ; return to shader part epilog
820;
821; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
822; GFX8:       ; %bb.0:
823; GFX8-NEXT:    v_mov_b32_e32 v2, s3
824; GFX8-NEXT:    v_bfi_b32 v1, s1, v2, v1
825; GFX8-NEXT:    v_mov_b32_e32 v2, s2
826; GFX8-NEXT:    v_bfi_b32 v0, s0, v2, v0
827; GFX8-NEXT:    ; return to shader part epilog
828;
829; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
830; GFX10:       ; %bb.0:
831; GFX10-NEXT:    v_bfi_b32 v0, s0, s2, v0
832; GFX10-NEXT:    v_bfi_b32 v1, s1, s3, v1
833; GFX10-NEXT:    ; return to shader part epilog
834;
835; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
836; GFX8-GISEL:       ; %bb.0:
837; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
838; GFX8-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
839; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
840; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
841; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s2, v0
842; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s3, v1
843; GFX8-GISEL-NEXT:    ; return to shader part epilog
844;
845; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
846; GFX10-GISEL:       ; %bb.0:
847; GFX10-GISEL-NEXT:    s_not_b64 s[4:5], s[0:1]
848; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
849; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
850; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s5, v1
851; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
852; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
853; GFX10-GISEL-NEXT:    ; return to shader part epilog
854  %and0 = and i64 %a, %b
855  %not.a = xor i64 %a, -1
856  %and1 = and i64 %not.a, %mask
857  %bitselect = or i64 %and0, %and1
858  %cast = bitcast i64 %bitselect to <2 x float>
859  ret <2 x float> %cast
860}
861
; Bitselect pattern 0 on i64: (a & b) | (~a & mask), returned as <2 x float>.
; Only %mask is inreg (it shows up as s0/s1 in the expected output); %a and %b
; are plain arguments. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32 with the
; s-register as the last operand; the -GISEL checks expect and/xor/or sequences.
862define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
863; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
864; GFX7:       ; %bb.0:
865; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
866; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
867; GFX7-NEXT:    ; return to shader part epilog
868;
869; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
870; GFX8:       ; %bb.0:
871; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
872; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
873; GFX8-NEXT:    ; return to shader part epilog
874;
875; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
876; GFX10:       ; %bb.0:
877; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
878; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
879; GFX10-NEXT:    ; return to shader part epilog
880;
881; GFX8-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
882; GFX8-GISEL:       ; %bb.0:
883; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, v0, v2
884; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, v1, v3
885; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
886; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
887; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
888; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
889; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
890; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
891; GFX8-GISEL-NEXT:    ; return to shader part epilog
892;
893; GFX10-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
894; GFX10-GISEL:       ; %bb.0:
895; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, -1, v0
896; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, -1, v1
897; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
898; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
899; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s0, v4
900; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s1, v5
901; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
902; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
903; GFX10-GISEL-NEXT:    ; return to shader part epilog
904  %and0 = and i64 %a, %b
905  %not.a = xor i64 %a, -1
906  %and1 = and i64 %not.a, %mask
907  %bitselect = or i64 %and0, %and1
908  %cast = bitcast i64 %bitselect to <2 x float>
909  ret <2 x float> %cast
910}
911
; Bitselect pattern 0 on i64: (a & b) | (~a & mask), returned as <2 x float>.
; Only %b is inreg (it shows up as s0/s1 in the expected output); %a and %mask
; are plain arguments. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32 with the
; s-register as the middle source; the -GISEL checks expect and/xor/or sequences.
912define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
913; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
914; GFX7:       ; %bb.0:
915; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v3
916; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
917; GFX7-NEXT:    ; return to shader part epilog
918;
919; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
920; GFX8:       ; %bb.0:
921; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v3
922; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
923; GFX8-NEXT:    ; return to shader part epilog
924;
925; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
926; GFX10:       ; %bb.0:
927; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v2
928; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, v3
929; GFX10-NEXT:    ; return to shader part epilog
930;
931; GFX8-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
932; GFX8-GISEL:       ; %bb.0:
933; GFX8-GISEL-NEXT:    v_and_b32_e32 v4, s0, v0
934; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, s1, v1
935; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, -1, v0
936; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, -1, v1
937; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
938; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
939; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
940; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v5, v1
941; GFX8-GISEL-NEXT:    ; return to shader part epilog
942;
943; GFX10-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
944; GFX10-GISEL:       ; %bb.0:
945; GFX10-GISEL-NEXT:    v_xor_b32_e32 v4, -1, v0
946; GFX10-GISEL-NEXT:    v_xor_b32_e32 v5, -1, v1
947; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
948; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
949; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v4, v2
950; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v5, v3
951; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
952; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
953; GFX10-GISEL-NEXT:    ; return to shader part epilog
954  %and0 = and i64 %a, %b
955  %not.a = xor i64 %a, -1
956  %and1 = and i64 %not.a, %mask
957  %bitselect = or i64 %and0, %and1
958  %cast = bitcast i64 %bitselect to <2 x float>
959  ret <2 x float> %cast
960}
961
; Bitselect pattern 0 on i64: (a & b) | (~a & mask), returned as <2 x float>.
; Only %a is inreg (it shows up as s0/s1 in the expected output); %b and %mask
; are plain arguments. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32 with the
; s-register as the first source; the -GISEL checks expect s_not_b64 plus and/or.
962define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
963; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
964; GFX7:       ; %bb.0:
965; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v3
966; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
967; GFX7-NEXT:    ; return to shader part epilog
968;
969; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
970; GFX8:       ; %bb.0:
971; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v3
972; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
973; GFX8-NEXT:    ; return to shader part epilog
974;
975; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
976; GFX10:       ; %bb.0:
977; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v2
978; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, v3
979; GFX10-NEXT:    ; return to shader part epilog
980;
981; GFX8-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
982; GFX8-GISEL:       ; %bb.0:
983; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
984; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
985; GFX8-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
986; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v2
987; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v3
988; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
989; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
990; GFX8-GISEL-NEXT:    ; return to shader part epilog
991;
992; GFX10-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
993; GFX10-GISEL:       ; %bb.0:
994; GFX10-GISEL-NEXT:    s_not_b64 s[2:3], s[0:1]
995; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
996; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
997; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
998; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
999; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1000; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1001; GFX10-GISEL-NEXT:    ; return to shader part epilog
1002  %and0 = and i64 %a, %b
1003  %not.a = xor i64 %a, -1
1004  %and1 = and i64 %not.a, %mask
1005  %bitselect = or i64 %and0, %and1
1006  %cast = bitcast i64 %bitselect to <2 x float>
1007  ret <2 x float> %cast
1008}
1009
; Bitselect pattern 1 on i64: ((a ^ mask) & b) ^ mask, returned directly as i64.
; No argument is inreg. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32 followed
; by s_setpc_b64; the -GISEL checks expect the xor/and/xor sequence left as is.
1010define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1011; GFX7-LABEL: v_bitselect_i64_pat_1:
1012; GFX7:       ; %bb.0:
1013; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1014; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
1015; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
1016; GFX7-NEXT:    s_setpc_b64 s[30:31]
1017;
1018; GFX8-LABEL: v_bitselect_i64_pat_1:
1019; GFX8:       ; %bb.0:
1020; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1021; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
1022; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
1023; GFX8-NEXT:    s_setpc_b64 s[30:31]
1024;
1025; GFX10-LABEL: v_bitselect_i64_pat_1:
1026; GFX10:       ; %bb.0:
1027; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1029; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
1030; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
1031; GFX10-NEXT:    s_setpc_b64 s[30:31]
1032;
1033; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
1034; GFX8-GISEL:       ; %bb.0:
1035; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1037; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1038; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1039; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1040; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1041; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1042; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1043;
1044; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
1045; GFX10-GISEL:       ; %bb.0:
1046; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1047; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1048; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1049; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1050; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1051; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1052; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1053; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1054; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1055  %xor.0 = xor i64 %a, %mask
1056  %and = and i64 %xor.0, %b
1057  %bitselect = xor i64 %and, %mask
1058  ret i64 %bitselect
1059}
1060
; Bitselect pattern 1 on i64: ((a ^ mask) & b) ^ mask, returned as <2 x float>.
; %b and %mask are inreg (s0/s1 and s2/s3 in the expected output); %a is a plain
; argument. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32 (GFX7/GFX8 first copy
; the mask halves into v2); the -GISEL checks expect xor/and/xor sequences.
1061define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
1062; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
1063; GFX7:       ; %bb.0:
1064; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1065; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
1066; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1067; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
1068; GFX7-NEXT:    ; return to shader part epilog
1069;
1070; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
1071; GFX8:       ; %bb.0:
1072; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1073; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
1074; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1075; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
1076; GFX8-NEXT:    ; return to shader part epilog
1077;
1078; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
1079; GFX10:       ; %bb.0:
1080; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
1081; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
1082; GFX10-NEXT:    ; return to shader part epilog
1083;
1084; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
1085; GFX8-GISEL:       ; %bb.0:
1086; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1087; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1088; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1089; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1090; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1091; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1092; GFX8-GISEL-NEXT:    ; return to shader part epilog
1093;
1094; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
1095; GFX10-GISEL:       ; %bb.0:
1096; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1097; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1098; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1099; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1100; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1101; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1102; GFX10-GISEL-NEXT:    ; return to shader part epilog
1103  %xor.0 = xor i64 %a, %mask
1104  %and = and i64 %xor.0, %b
1105  %bitselect = xor i64 %and, %mask
1106  %cast = bitcast i64 %bitselect to <2 x float>
1107  ret <2 x float> %cast
1108}
1109
; Bitselect pattern 1 on i64: ((a ^ mask) & b) ^ mask, returned as <2 x float>.
; %a and %b are inreg (s0/s1 and s2/s3 in the expected output); %mask is a plain
; argument. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32 (GFX7/GFX8 first copy
; the %a halves into v2); the -GISEL checks expect xor/and/xor sequences.
1110define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
1111; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
1112; GFX7:       ; %bb.0:
1113; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1114; GFX7-NEXT:    v_bfi_b32 v1, s3, v2, v1
1115; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1116; GFX7-NEXT:    v_bfi_b32 v0, s2, v2, v0
1117; GFX7-NEXT:    ; return to shader part epilog
1118;
1119; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
1120; GFX8:       ; %bb.0:
1121; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1122; GFX8-NEXT:    v_bfi_b32 v1, s3, v2, v1
1123; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1124; GFX8-NEXT:    v_bfi_b32 v0, s2, v2, v0
1125; GFX8-NEXT:    ; return to shader part epilog
1126;
1127; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
1128; GFX10:       ; %bb.0:
1129; GFX10-NEXT:    v_bfi_b32 v0, s2, s0, v0
1130; GFX10-NEXT:    v_bfi_b32 v1, s3, s1, v1
1131; GFX10-NEXT:    ; return to shader part epilog
1132;
1133; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1134; GFX8-GISEL:       ; %bb.0:
1135; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, s0, v0
1136; GFX8-GISEL-NEXT:    v_xor_b32_e32 v3, s1, v1
1137; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1138; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1139; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v0
1140; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v3, v1
1141; GFX8-GISEL-NEXT:    ; return to shader part epilog
1142;
1143; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1144; GFX10-GISEL:       ; %bb.0:
1145; GFX10-GISEL-NEXT:    v_xor_b32_e32 v2, s0, v0
1146; GFX10-GISEL-NEXT:    v_xor_b32_e32 v3, s1, v1
1147; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1148; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1149; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v0
1150; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v3, v1
1151; GFX10-GISEL-NEXT:    ; return to shader part epilog
1152  %xor.0 = xor i64 %a, %mask
1153  %and = and i64 %xor.0, %b
1154  %bitselect = xor i64 %and, %mask
1155  %cast = bitcast i64 %bitselect to <2 x float>
1156  ret <2 x float> %cast
1157}
1158
; Bitselect pattern 1 on i64: ((a ^ mask) & b) ^ mask, returned as <2 x float>.
; %a and %mask are inreg (s0/s1 and s2/s3 in the expected output); %b is a plain
; argument. The GFX7/GFX8/GFX10 checks expect two v_bfi_b32; the -GISEL checks
; expect a single s_xor_b64 for the first xor, then per-half v_and/v_xor.
1159define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
1160; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
1161; GFX7:       ; %bb.0:
1162; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1163; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
1164; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1165; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
1166; GFX7-NEXT:    ; return to shader part epilog
1167;
1168; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
1169; GFX8:       ; %bb.0:
1170; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1171; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
1172; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1173; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
1174; GFX8-NEXT:    ; return to shader part epilog
1175;
1176; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
1177; GFX10:       ; %bb.0:
1178; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
1179; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
1180; GFX10-NEXT:    ; return to shader part epilog
1181;
1182; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1183; GFX8-GISEL:       ; %bb.0:
1184; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
1185; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1186; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1187; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1188; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1189; GFX8-GISEL-NEXT:    ; return to shader part epilog
1190;
1191; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1192; GFX10-GISEL:       ; %bb.0:
1193; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
1194; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1195; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1196; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1197; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1198; GFX10-GISEL-NEXT:    ; return to shader part epilog
1199  %xor.0 = xor i64 %a, %mask
1200  %and = and i64 %xor.0, %b
1201  %bitselect = xor i64 %and, %mask
1202  %cast = bitcast i64 %bitselect to <2 x float>
1203  ret <2 x float> %cast
1204}
1205
; Bitselect on i64 written as ((a ^ mask) & b) ^ mask, returned directly as i64.
; The GFX7/GFX8/GFX10 checks expect two v_bfi_b32; the -GISEL checks expect the
; xor/and/xor sequence left as is.
; NOTE(review): the IR body here is the same as in v_bitselect_i64_pat_1, so this
; function does not exercise a distinct pattern - confirm whether a different
; operand arrangement was intended for "pat_2" (changing the IR requires
; regenerating the assertions with utils/update_llc_test_checks.py).
1206define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1207; GFX7-LABEL: v_bitselect_i64_pat_2:
1208; GFX7:       ; %bb.0:
1209; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
1211; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
1212; GFX7-NEXT:    s_setpc_b64 s[30:31]
1213;
1214; GFX8-LABEL: v_bitselect_i64_pat_2:
1215; GFX8:       ; %bb.0:
1216; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1217; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
1218; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
1219; GFX8-NEXT:    s_setpc_b64 s[30:31]
1220;
1221; GFX10-LABEL: v_bitselect_i64_pat_2:
1222; GFX10:       ; %bb.0:
1223; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1224; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1225; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
1226; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
1227; GFX10-NEXT:    s_setpc_b64 s[30:31]
1228;
1229; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
1230; GFX8-GISEL:       ; %bb.0:
1231; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1232; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1233; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1234; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1235; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1236; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1237; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1238; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1239;
1240; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
1241; GFX10-GISEL:       ; %bb.0:
1242; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1244; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1245; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1246; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1247; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1248; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
1249; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
1250; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1251  %xor.0 = xor i64 %a, %mask
1252  %and = and i64 %xor.0, %b
1253  %bitselect = xor i64 %and, %mask
1254  ret i64 %bitselect
1255}
1256
; "sha256_ma" pattern on i64: (x & z) | (y & (x | z)), returned directly as i64.
; No argument is inreg. The GFX7/GFX8/GFX10 checks expect, per 32-bit register,
; a v_xor_b32 whose result is the first source of a v_bfi_b32; the -GISEL checks
; expect the and/or/and/or sequence left as separate instructions.
1257define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1258; GFX7-LABEL: v_bfi_sha256_ma_i64:
1259; GFX7:       ; %bb.0: ; %entry
1260; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
1262; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
1263; GFX7-NEXT:    v_bfi_b32 v1, v1, v5, v3
1264; GFX7-NEXT:    v_bfi_b32 v0, v0, v4, v2
1265; GFX7-NEXT:    s_setpc_b64 s[30:31]
1266;
1267; GFX8-LABEL: v_bfi_sha256_ma_i64:
1268; GFX8:       ; %bb.0: ; %entry
1269; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1270; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
1271; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
1272; GFX8-NEXT:    v_bfi_b32 v1, v1, v5, v3
1273; GFX8-NEXT:    v_bfi_b32 v0, v0, v4, v2
1274; GFX8-NEXT:    s_setpc_b64 s[30:31]
1275;
1276; GFX10-LABEL: v_bfi_sha256_ma_i64:
1277; GFX10:       ; %bb.0: ; %entry
1278; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1279; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1280; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
1281; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
1282; GFX10-NEXT:    v_bfi_b32 v0, v0, v4, v2
1283; GFX10-NEXT:    v_bfi_b32 v1, v1, v5, v3
1284; GFX10-NEXT:    s_setpc_b64 s[30:31]
1285;
1286; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
1287; GFX8-GISEL:       ; %bb.0: ; %entry
1288; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1289; GFX8-GISEL-NEXT:    v_and_b32_e32 v6, v0, v4
1290; GFX8-GISEL-NEXT:    v_and_b32_e32 v7, v1, v5
1291; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
1292; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
1293; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, v2, v0
1294; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
1295; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v6, v0
1296; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v7, v1
1297; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1298;
1299; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
1300; GFX10-GISEL:       ; %bb.0: ; %entry
1301; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1302; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
1303; GFX10-GISEL-NEXT:    v_or_b32_e32 v6, v0, v4
1304; GFX10-GISEL-NEXT:    v_or_b32_e32 v7, v1, v5
1305; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
1306; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v5
1307; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, v2, v6
1308; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, v3, v7
1309; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1310; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1311; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1312entry:
1313  %and0 = and i64 %x, %z
1314  %or0 = or i64 %x, %z
1315  %and1 = and i64 %y, %or0
1316  %or1 = or i64 %and0, %and1
1317  ret i64 %or1
1318}
1319
; "sha256_ma" pattern on i64: (x & z) | (y & (x | z)), returned as <2 x float>.
; %y and %z are inreg (s0/s1 and s2/s3 in the expected output); %x is a plain
; argument. The GFX7/GFX8/GFX10 checks expect a v_xor_b32 feeding a v_bfi_b32 for
; each half (GFX7/GFX8 also copy the %y halves into v2); the -GISEL checks expect
; and/or sequences with no v_bfi_b32.
1320define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
1321; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
1322; GFX7:       ; %bb.0: ; %entry
1323; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
1324; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1325; GFX7-NEXT:    v_bfi_b32 v1, v1, s3, v2
1326; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
1327; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1328; GFX7-NEXT:    v_bfi_b32 v0, v0, s2, v2
1329; GFX7-NEXT:    ; return to shader part epilog
1330;
1331; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
1332; GFX8:       ; %bb.0: ; %entry
1333; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1334; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1335; GFX8-NEXT:    v_bfi_b32 v1, v1, s3, v2
1336; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1337; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1338; GFX8-NEXT:    v_bfi_b32 v0, v0, s2, v2
1339; GFX8-NEXT:    ; return to shader part epilog
1340;
1341; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
1342; GFX10:       ; %bb.0: ; %entry
1343; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
1344; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
1345; GFX10-NEXT:    v_bfi_b32 v0, v0, s2, s0
1346; GFX10-NEXT:    v_bfi_b32 v1, v1, s3, s1
1347; GFX10-NEXT:    ; return to shader part epilog
1348;
1349; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1350; GFX8-GISEL:       ; %bb.0: ; %entry
1351; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s2, v0
1352; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s3, v1
1353; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s2, v0
1354; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s3, v1
1355; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1356; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1357; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
1358; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
1359; GFX8-GISEL-NEXT:    ; return to shader part epilog
1360;
1361; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1362; GFX10-GISEL:       ; %bb.0: ; %entry
1363; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, s2, v0
1364; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, s3, v1
1365; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1366; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s3, v1
1367; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s0, v2
1368; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s1, v3
1369; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1370; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1371; GFX10-GISEL-NEXT:    ; return to shader part epilog
1372entry:
1373  %and0 = and i64 %x, %z
1374  %or0 = or i64 %x, %z
1375  %and1 = and i64 %y, %or0
1376  %or1 = or i64 %and0, %and1
1377  %cast = bitcast i64 %or1 to <2 x float>
1378  ret <2 x float> %cast
1379}
1380
; "sha256_ma" pattern on i64: (x & z) | (y & (x | z)), returned as <2 x float>.
; %x and %z are inreg (s0/s1 and s2/s3 in the expected output); %y is a plain
; argument. The GFX7/GFX8/GFX10 checks expect a v_xor_b32 feeding a v_bfi_b32 for
; each half; the -GISEL checks expect s_and_b64/s_or_b64 followed by per-half
; v_and/v_or, with no v_bfi_b32.
1381define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
1382; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
1383; GFX7:       ; %bb.0: ; %entry
1384; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v1
1385; GFX7-NEXT:    v_bfi_b32 v1, v2, s3, v1
1386; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v0
1387; GFX7-NEXT:    v_bfi_b32 v0, v2, s2, v0
1388; GFX7-NEXT:    ; return to shader part epilog
1389;
1390; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
1391; GFX8:       ; %bb.0: ; %entry
1392; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v1
1393; GFX8-NEXT:    v_bfi_b32 v1, v2, s3, v1
1394; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v0
1395; GFX8-NEXT:    v_bfi_b32 v0, v2, s2, v0
1396; GFX8-NEXT:    ; return to shader part epilog
1397;
1398; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
1399; GFX10:       ; %bb.0: ; %entry
1400; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v0
1401; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v1
1402; GFX10-NEXT:    v_bfi_b32 v0, v2, s2, v0
1403; GFX10-NEXT:    v_bfi_b32 v1, v3, s3, v1
1404; GFX10-NEXT:    ; return to shader part epilog
1405;
1406; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1407; GFX8-GISEL:       ; %bb.0: ; %entry
1408; GFX8-GISEL-NEXT:    s_and_b64 s[4:5], s[0:1], s[2:3]
1409; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1410; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1411; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1412; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
1413; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s5, v1
1414; GFX8-GISEL-NEXT:    ; return to shader part epilog
1415;
1416; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1417; GFX10-GISEL:       ; %bb.0: ; %entry
1418; GFX10-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[2:3]
1419; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1420; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s4, v0
1421; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s5, v1
1422; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
1423; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
1424; GFX10-GISEL-NEXT:    ; return to shader part epilog
1425entry:
1426  %and0 = and i64 %x, %z
1427  %or0 = or i64 %x, %z
1428  %and1 = and i64 %y, %or0
1429  %or1 = or i64 %and0, %and1
1430  %cast = bitcast i64 %or1 to <2 x float>
1431  ret <2 x float> %cast
1432}
1433
; "sha256_ma" pattern on i64: (x & z) | (y & (x | z)), returned as <2 x float>.
; %x and %y are inreg (s0/s1 and s2/s3 in the expected output); %z is a plain
; argument. The GFX7/GFX8/GFX10 checks expect an xor of the two s-register
; inputs feeding a v_bfi_b32 for each half (GFX7/GFX8 via a v_mov_b32 copy, GFX10
; via v_xor_b32_e64); the -GISEL checks expect and/or sequences, no v_bfi_b32.
1434define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
1435; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
1436; GFX7:       ; %bb.0: ; %entry
1437; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1438; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v2
1439; GFX7-NEXT:    v_bfi_b32 v1, v2, v1, s3
1440; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1441; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v2
1442; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, s2
1443; GFX7-NEXT:    ; return to shader part epilog
1444;
1445; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
1446; GFX8:       ; %bb.0: ; %entry
1447; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1448; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v2
1449; GFX8-NEXT:    v_bfi_b32 v1, v2, v1, s3
1450; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1451; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
1452; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, s2
1453; GFX8-NEXT:    ; return to shader part epilog
1454;
1455; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
1456; GFX10:       ; %bb.0: ; %entry
1457; GFX10-NEXT:    v_xor_b32_e64 v2, s0, s2
1458; GFX10-NEXT:    v_xor_b32_e64 v3, s1, s3
1459; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, s2
1460; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, s3
1461; GFX10-NEXT:    ; return to shader part epilog
1462;
1463; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1464; GFX8-GISEL:       ; %bb.0: ; %entry
1465; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v0
1466; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v1
1467; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
1468; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
1469; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s2, v0
1470; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s3, v1
1471; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
1472; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
1473; GFX8-GISEL-NEXT:    ; return to shader part epilog
1474;
1475; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1476; GFX10-GISEL:       ; %bb.0: ; %entry
1477; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, s0, v0
1478; GFX10-GISEL-NEXT:    v_or_b32_e32 v3, s1, v1
1479; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1480; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1481; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
1482; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
1483; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1484; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1485; GFX10-GISEL-NEXT:    ; return to shader part epilog
1486entry:
1487  %and0 = and i64 %x, %z
1488  %or0 = or i64 %x, %z
1489  %and1 = and i64 %y, %or0
1490  %or1 = or i64 %and0, %and1
1491  %cast = bitcast i64 %or1 to <2 x float>
1492  ret <2 x float> %cast
1493}
1494
; "sha256_ma" pattern on i64: (x & z) | (y & (x | z)), returned as <2 x float>.
; Only %y is inreg (s0/s1 in the expected output); %x and %z are plain
; arguments. The GFX7/GFX8/GFX10 checks expect a v_xor_b32 feeding a v_bfi_b32
; for each half; the -GISEL checks expect and/or sequences with no v_bfi_b32.
1495define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
1496; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
1497; GFX7:       ; %bb.0: ; %entry
1498; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
1499; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
1500; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
1501; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
1502; GFX7-NEXT:    ; return to shader part epilog
1503;
1504; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
1505; GFX8:       ; %bb.0: ; %entry
1506; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1507; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1508; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
1509; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
1510; GFX8-NEXT:    ; return to shader part epilog
1511;
1512; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
1513; GFX10:       ; %bb.0: ; %entry
1514; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
1515; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
1516; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
1517; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
1518; GFX10-NEXT:    ; return to shader part epilog
1519;
1520; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1521; GFX8-GISEL:       ; %bb.0: ; %entry
1522; GFX8-GISEL-NEXT:    v_and_b32_e32 v4, v0, v2
1523; GFX8-GISEL-NEXT:    v_and_b32_e32 v5, v1, v3
1524; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1525; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1526; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1527; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1528; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
1529; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v5, v1
1530; GFX8-GISEL-NEXT:    ; return to shader part epilog
1531;
1532; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1533; GFX10-GISEL:       ; %bb.0: ; %entry
1534; GFX10-GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
1535; GFX10-GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
1536; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
1537; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
1538; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s0, v4
1539; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s1, v5
1540; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
1541; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
1542; GFX10-GISEL-NEXT:    ; return to shader part epilog
1543entry:
1544  %and0 = and i64 %x, %z
1545  %or0 = or i64 %x, %z
1546  %and1 = and i64 %y, %or0
1547  %or1 = or i64 %and0, %and1
1548  %cast = bitcast i64 %or1 to <2 x float>
1549  ret <2 x float> %cast
1550}
1551
1552define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
1553; GFX7-LABEL: s_bitselect_i64_pat_0:
1554; GFX7:       ; %bb.0:
1555; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1556; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1557; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1558; GFX7-NEXT:    s_mov_b32 s2, -1
1559; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX7-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
1561; GFX7-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1562; GFX7-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
1563; GFX7-NEXT:    s_add_u32 s0, s0, 10
1564; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1565; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1566; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1567; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1568; GFX7-NEXT:    s_endpgm
1569;
1570; GFX8-LABEL: s_bitselect_i64_pat_0:
1571; GFX8:       ; %bb.0:
1572; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1573; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1574; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1575; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1576; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1577; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1578; GFX8-NEXT:    s_add_u32 s0, s0, 10
1579; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1580; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1581; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1582; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1583; GFX8-NEXT:    s_endpgm
1584;
1585; GFX10-LABEL: s_bitselect_i64_pat_0:
1586; GFX10:       ; %bb.0:
1587; GFX10-NEXT:    s_clause 0x1
1588; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1589; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1590; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX10-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1592; GFX10-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1593; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1594; GFX10-NEXT:    s_add_u32 s0, s0, 10
1595; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1596; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1597; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1598; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1599; GFX10-NEXT:    s_endpgm
1600;
1601; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
1602; GFX8-GISEL:       ; %bb.0:
1603; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1604; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1605; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1606; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1607; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1608; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1609; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1610; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1611; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1612; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1613; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1614; GFX8-GISEL-NEXT:    s_endpgm
1615;
1616; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0:
1617; GFX10-GISEL:       ; %bb.0:
1618; GFX10-GISEL-NEXT:    s_clause 0x1
1619; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1620; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1621; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1622; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
1623; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
1624; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1625; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1626; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1627; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1628; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1629; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1630; GFX10-GISEL-NEXT:    s_endpgm
; i64 bit-select on kernel arguments, written in and / and-not / or form
;   result = (%a & %b) | (~%a & %mask)
; Per bit, %a chooses between %b (bit set) and %mask (bit clear).
; The assertions above expect s_and_b64, s_andn2_b64 and s_or_b64 on every
; checked target, then the 64-bit add split into s_add_u32 + s_addc_u32.
1631  %and0 = and i64 %a, %b ; bits of %b where %a is set
1632  %not.a = xor i64 %a, -1 ; bitwise NOT of %a
1633  %and1 = and i64 %not.a, %mask ; bits of %mask where %a is clear
1634  %bitselect = or i64 %and0, %and1 ; merge the two disjoint halves
1635  %scalar.use = add i64 %bitselect, 10 ; NOTE(review): presumably an extra user meant to keep the result on the scalar unit - confirm
1636  store i64 %scalar.use, i64 addrspace(1)* undef ; destination pointer is undef; the store only consumes the value
1637  ret void
1638}
1639
1640define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1641; GFX7-LABEL: s_bitselect_i64_pat_1:
1642; GFX7:       ; %bb.0:
1643; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1644; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1645; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1646; GFX7-NEXT:    s_mov_b32 s2, -1
1647; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1648; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1649; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1650; GFX7-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
1651; GFX7-NEXT:    s_add_u32 s0, s0, 10
1652; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1653; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1654; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1655; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1656; GFX7-NEXT:    s_endpgm
1657;
1658; GFX8-LABEL: s_bitselect_i64_pat_1:
1659; GFX8:       ; %bb.0:
1660; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1661; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1662; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1663; GFX8-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1664; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1665; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1666; GFX8-NEXT:    s_add_u32 s0, s0, 10
1667; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1668; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1669; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1670; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1671; GFX8-NEXT:    s_endpgm
1672;
1673; GFX10-LABEL: s_bitselect_i64_pat_1:
1674; GFX10:       ; %bb.0:
1675; GFX10-NEXT:    s_clause 0x1
1676; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1677; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1678; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX10-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1680; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1681; GFX10-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1682; GFX10-NEXT:    s_add_u32 s0, s0, 10
1683; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1684; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1685; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1686; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1687; GFX10-NEXT:    s_endpgm
1688;
1689; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
1690; GFX8-GISEL:       ; %bb.0:
1691; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1692; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1693; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX8-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1695; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1696; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1697; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1698; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1699; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1700; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1701; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1702; GFX8-GISEL-NEXT:    s_endpgm
1703;
1704; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1:
1705; GFX10-GISEL:       ; %bb.0:
1706; GFX10-GISEL-NEXT:    s_clause 0x1
1707; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1708; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1709; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX10-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1711; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1712; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1713; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1714; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1715; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1716; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1717; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1718; GFX10-GISEL-NEXT:    s_endpgm
; i64 bit-select on kernel arguments, written in xor / and / xor form
;   result = ((%a ^ %mask) & %b) ^ %mask
; Per bit this yields %a where %b is set and %mask where %b is clear.
; The assertions above expect the sequence s_xor_b64, s_and_b64, s_xor_b64
; on every checked target, then s_add_u32 + s_addc_u32 for the 64-bit add.
1719  %xor.0 = xor i64 %a, %mask ; bits in which %a and %mask differ
1720  %and = and i64 %xor.0, %b ; keep the differing bits only where %b is set
1721  %bitselect = xor i64 %and, %mask ; flip those bits in %mask, turning them into %a's bits
1722
1723  %scalar.use = add i64 %bitselect, 10 ; NOTE(review): presumably an extra user meant to keep the result on the scalar unit - confirm
1724  store i64 %scalar.use, i64 addrspace(1)* undef ; destination pointer is undef; the store only consumes the value
1725  ret void
1726}
1727
1728define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1729; GFX7-LABEL: s_bitselect_i64_pat_2:
1730; GFX7:       ; %bb.0:
1731; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1732; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1733; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1734; GFX7-NEXT:    s_mov_b32 s2, -1
1735; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1736; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1737; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1738; GFX7-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
1739; GFX7-NEXT:    s_add_u32 s0, s0, 10
1740; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1741; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1742; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1743; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1744; GFX7-NEXT:    s_endpgm
1745;
1746; GFX8-LABEL: s_bitselect_i64_pat_2:
1747; GFX8:       ; %bb.0:
1748; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1749; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1750; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1751; GFX8-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1752; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1753; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1754; GFX8-NEXT:    s_add_u32 s0, s0, 10
1755; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1756; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1757; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1758; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1759; GFX8-NEXT:    s_endpgm
1760;
1761; GFX10-LABEL: s_bitselect_i64_pat_2:
1762; GFX10:       ; %bb.0:
1763; GFX10-NEXT:    s_clause 0x1
1764; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1765; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1766; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1767; GFX10-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1768; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1769; GFX10-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1770; GFX10-NEXT:    s_add_u32 s0, s0, 10
1771; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1772; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1773; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1774; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1775; GFX10-NEXT:    s_endpgm
1776;
1777; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
1778; GFX8-GISEL:       ; %bb.0:
1779; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1780; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1781; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1782; GFX8-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1783; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1784; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1785; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1786; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1787; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1788; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1789; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1790; GFX8-GISEL-NEXT:    s_endpgm
1791;
1792; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2:
1793; GFX10-GISEL:       ; %bb.0:
1794; GFX10-GISEL-NEXT:    s_clause 0x1
1795; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1796; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1797; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1798; GFX10-GISEL-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1799; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1800; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1801; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1802; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1803; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1804; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1805; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1806; GFX10-GISEL-NEXT:    s_endpgm
; i64 bit-select on kernel arguments, written in xor / and / xor form
;   result = ((%a ^ %mask) & %b) ^ %mask
; Per bit this yields %a where %b is set and %mask where %b is clear.
; The assertions above expect the sequence s_xor_b64, s_and_b64, s_xor_b64
; on every checked target, then s_add_u32 + s_addc_u32 for the 64-bit add.
; NOTE(review): the IR below is instruction-for-instruction the same as
; @s_bitselect_i64_pat_1, so this function adds no new coverage as written.
; Possibly a different operand order was intended - confirm before changing,
; since any edit to the IR requires regenerating the assertions.
1807  %xor.0 = xor i64 %a, %mask ; bits in which %a and %mask differ
1808  %and = and i64 %xor.0, %b ; keep the differing bits only where %b is set
1809  %bitselect = xor i64 %and, %mask ; flip those bits in %mask, turning them into %a's bits
1810
1811  %scalar.use = add i64 %bitselect, 10 ; NOTE(review): presumably an extra user meant to keep the result on the scalar unit - confirm
1812  store i64 %scalar.use, i64 addrspace(1)* undef ; destination pointer is undef; the store only consumes the value
1813  ret void
1814}
1815
1816define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1817; GFX7-LABEL: s_bfi_sha256_ma_i64:
1818; GFX7:       ; %bb.0: ; %entry
1819; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1820; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1821; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1822; GFX7-NEXT:    s_mov_b32 s2, -1
1823; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1824; GFX7-NEXT:    s_and_b64 s[8:9], s[4:5], s[0:1]
1825; GFX7-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1826; GFX7-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1827; GFX7-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1828; GFX7-NEXT:    s_add_u32 s0, s0, 10
1829; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1830; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1831; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1832; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1833; GFX7-NEXT:    s_endpgm
1834;
1835; GFX8-LABEL: s_bfi_sha256_ma_i64:
1836; GFX8:       ; %bb.0: ; %entry
1837; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1838; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1839; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1840; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[0:1]
1841; GFX8-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1842; GFX8-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1843; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1844; GFX8-NEXT:    s_add_u32 s0, s0, 10
1845; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1846; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1847; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1848; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1849; GFX8-NEXT:    s_endpgm
1850;
1851; GFX10-LABEL: s_bfi_sha256_ma_i64:
1852; GFX10:       ; %bb.0: ; %entry
1853; GFX10-NEXT:    s_clause 0x1
1854; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1855; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1856; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1857; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[0:1]
1858; GFX10-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
1859; GFX10-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
1860; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1861; GFX10-NEXT:    s_add_u32 s0, s0, 10
1862; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1863; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1864; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1865; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1866; GFX10-NEXT:    s_endpgm
1867;
1868; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
1869; GFX8-GISEL:       ; %bb.0: ; %entry
1870; GFX8-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1871; GFX8-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1872; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1873; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[4:5], s[0:1]
1874; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1875; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1876; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1877; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1878; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1879; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1880; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1881; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1882; GFX8-GISEL-NEXT:    s_endpgm
1883;
1884; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64:
1885; GFX10-GISEL:       ; %bb.0: ; %entry
1886; GFX10-GISEL-NEXT:    s_clause 0x1
1887; GFX10-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1888; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1889; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1890; GFX10-GISEL-NEXT:    s_or_b64 s[2:3], s[4:5], s[0:1]
1891; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
1892; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
1893; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1894; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1895; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1896; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1897; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1898; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1899; GFX10-GISEL-NEXT:    s_endpgm
; i64 three-input expression on kernel arguments
;   result = (%x & %z) | (%y & (%x | %z))
; Per bit this is the majority of %x, %y and %z (set when at least two of the
; inputs are set).
; The assertions above expect two s_and_b64 and two s_or_b64 on every checked
; target (the tahiti and tonga runs emit them in a different order from the
; gfx1031 runs), then s_add_u32 + s_addc_u32 for the 64-bit add.
1900entry:
1901  %and0 = and i64 %x, %z ; bits set in both %x and %z
1902  %or0 = or i64 %x, %z ; bits set in at least one of %x and %z
1903  %and1 = and i64 %y, %or0 ; bits of %y that are also set in %x or %z
1904  %or1 = or i64 %and0, %and1 ; union of the two terms gives the per-bit majority
1905
1906  %scalar.use = add i64 %or1, 10 ; NOTE(review): presumably an extra user meant to keep the result on the scalar unit - confirm
1907  store i64 %scalar.use, i64 addrspace(1)* undef ; destination pointer is undef; the store only consumes the value
1908  ret void
1909}
1910