xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bfi_int.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
6; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
7
8; BFI_INT Definition pattern from ISA docs
9; (y & x) | (z & ~x)
10;
; Kernel form of the BFI definition pattern: computes
;   (%z & ~%x) | (%y & %x)
; from kernel arguments and stores the i32 result through %out.
; Every run configuration is expected to keep the computation on the scalar
; unit (s_andn2_b32 / s_and_b32 / s_or_b32); no v_bfi_b32 appears in the
; expected output for this function.
11define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
12; GFX7-LABEL: s_bfi_def_i32:
13; GFX7:       ; %bb.0: ; %entry
14; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
15; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
16; GFX7-NEXT:    s_mov_b32 s7, 0xf000
17; GFX7-NEXT:    s_mov_b32 s6, -1
18; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX7-NEXT:    s_andn2_b32 s2, s2, s0
20; GFX7-NEXT:    s_and_b32 s0, s1, s0
21; GFX7-NEXT:    s_or_b32 s0, s2, s0
22; GFX7-NEXT:    v_mov_b32_e32 v0, s0
23; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
24; GFX7-NEXT:    s_endpgm
25;
26; GFX8-LABEL: s_bfi_def_i32:
27; GFX8:       ; %bb.0: ; %entry
28; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
29; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
30; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX8-NEXT:    s_andn2_b32 s2, s2, s0
32; GFX8-NEXT:    s_and_b32 s0, s1, s0
33; GFX8-NEXT:    s_or_b32 s0, s2, s0
34; GFX8-NEXT:    v_mov_b32_e32 v0, s4
35; GFX8-NEXT:    v_mov_b32_e32 v1, s5
36; GFX8-NEXT:    v_mov_b32_e32 v2, s0
37; GFX8-NEXT:    flat_store_dword v[0:1], v2
38; GFX8-NEXT:    s_endpgm
39;
40; GFX10-LABEL: s_bfi_def_i32:
41; GFX10:       ; %bb.0: ; %entry
42; GFX10-NEXT:    s_clause 0x1
43; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
44; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
45; GFX10-NEXT:    v_mov_b32_e32 v0, 0
46; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX10-NEXT:    s_andn2_b32 s2, s2, s0
48; GFX10-NEXT:    s_and_b32 s0, s1, s0
49; GFX10-NEXT:    s_or_b32 s0, s2, s0
50; GFX10-NEXT:    v_mov_b32_e32 v1, s0
51; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
52; GFX10-NEXT:    s_endpgm
53;
54; GFX8-GISEL-LABEL: s_bfi_def_i32:
55; GFX8-GISEL:       ; %bb.0: ; %entry
56; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
57; GFX8-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x34
58; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
60; GFX8-GISEL-NEXT:    s_andn2_b32 s4, s4, s2
61; GFX8-GISEL-NEXT:    s_and_b32 s2, s3, s2
62; GFX8-GISEL-NEXT:    s_or_b32 s2, s4, s2
63; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
64; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
65; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
66; GFX8-GISEL-NEXT:    s_endpgm
67;
68; GFX10-GISEL-LABEL: s_bfi_def_i32:
69; GFX10-GISEL:       ; %bb.0: ; %entry
70; GFX10-GISEL-NEXT:    s_clause 0x1
71; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
72; GFX10-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x34
73; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
74; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
75; GFX10-GISEL-NEXT:    s_andn2_b32 s4, s4, s2
76; GFX10-GISEL-NEXT:    s_and_b32 s2, s3, s2
77; GFX10-GISEL-NEXT:    s_or_b32 s2, s4, s2
78; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
79; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
80; GFX10-GISEL-NEXT:    s_endpgm
81entry:
82  %0 = xor i32 %x, -1 ; ~x
83  %1 = and i32 %z, %0 ; z & ~x
84  %2 = and i32 %y, %x ; y & x
85  %3 = or i32 %1, %2 ; (z & ~x) | (y & x)
86  store i32 %3, ptr addrspace(1) %out
87  ret void
88}
89
; Callable-function form of the BFI definition pattern: returns
;   (%z & ~%x) | (%y & %x).
; All five run configurations are expected to select a single
; v_bfi_b32 v0, v0, v1, v2 (operands in x, y, z order).
90define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
91; GFX7-LABEL: v_bfi_def_i32:
92; GFX7:       ; %bb.0: ; %entry
93; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
95; GFX7-NEXT:    s_setpc_b64 s[30:31]
96;
97; GFX8-LABEL: v_bfi_def_i32:
98; GFX8:       ; %bb.0: ; %entry
99; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
101; GFX8-NEXT:    s_setpc_b64 s[30:31]
102;
103; GFX10-LABEL: v_bfi_def_i32:
104; GFX10:       ; %bb.0: ; %entry
105; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
107; GFX10-NEXT:    s_setpc_b64 s[30:31]
108;
109; GFX8-GISEL-LABEL: v_bfi_def_i32:
110; GFX8-GISEL:       ; %bb.0: ; %entry
111; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
113; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
114;
115; GFX10-GISEL-LABEL: v_bfi_def_i32:
116; GFX10-GISEL:       ; %bb.0: ; %entry
117; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
119; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
120entry:
121  %0 = xor i32 %x, -1 ; ~x
122  %1 = and i32 %z, %0 ; z & ~x
123  %2 = and i32 %y, %x ; y & x
124  %3 = or i32 %1, %2 ; (z & ~x) | (y & x)
125  ret i32 %3
126}
127
128; SHA-256 Ch function
129; z ^ (x & (y ^ z))
; Kernel form of the SHA-256 Ch pattern: computes
;   %z ^ (%x & (%y ^ %z))
; from kernel arguments and stores the i32 result through %out.
; Every run configuration is expected to use the scalar sequence
; s_xor_b32 / s_and_b32 / s_xor_b32; no v_bfi_b32 appears in the expected
; output for this function.
130define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
131; GFX7-LABEL: s_bfi_sha256_ch:
132; GFX7:       ; %bb.0: ; %entry
133; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
134; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
135; GFX7-NEXT:    s_mov_b32 s7, 0xf000
136; GFX7-NEXT:    s_mov_b32 s6, -1
137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX7-NEXT:    s_xor_b32 s1, s1, s2
139; GFX7-NEXT:    s_and_b32 s0, s0, s1
140; GFX7-NEXT:    s_xor_b32 s0, s2, s0
141; GFX7-NEXT:    v_mov_b32_e32 v0, s0
142; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
143; GFX7-NEXT:    s_endpgm
144;
145; GFX8-LABEL: s_bfi_sha256_ch:
146; GFX8:       ; %bb.0: ; %entry
147; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
148; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
149; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX8-NEXT:    s_xor_b32 s1, s1, s2
151; GFX8-NEXT:    s_and_b32 s0, s0, s1
152; GFX8-NEXT:    s_xor_b32 s0, s2, s0
153; GFX8-NEXT:    v_mov_b32_e32 v0, s4
154; GFX8-NEXT:    v_mov_b32_e32 v1, s5
155; GFX8-NEXT:    v_mov_b32_e32 v2, s0
156; GFX8-NEXT:    flat_store_dword v[0:1], v2
157; GFX8-NEXT:    s_endpgm
158;
159; GFX10-LABEL: s_bfi_sha256_ch:
160; GFX10:       ; %bb.0: ; %entry
161; GFX10-NEXT:    s_clause 0x1
162; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
163; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
164; GFX10-NEXT:    v_mov_b32_e32 v0, 0
165; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX10-NEXT:    s_xor_b32 s1, s1, s2
167; GFX10-NEXT:    s_and_b32 s0, s0, s1
168; GFX10-NEXT:    s_xor_b32 s0, s2, s0
169; GFX10-NEXT:    v_mov_b32_e32 v1, s0
170; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
171; GFX10-NEXT:    s_endpgm
172;
173; GFX8-GISEL-LABEL: s_bfi_sha256_ch:
174; GFX8-GISEL:       ; %bb.0: ; %entry
175; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
176; GFX8-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x34
177; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
179; GFX8-GISEL-NEXT:    s_xor_b32 s3, s3, s4
180; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, s3
181; GFX8-GISEL-NEXT:    s_xor_b32 s2, s4, s2
182; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
183; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
184; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
185; GFX8-GISEL-NEXT:    s_endpgm
186;
187; GFX10-GISEL-LABEL: s_bfi_sha256_ch:
188; GFX10-GISEL:       ; %bb.0: ; %entry
189; GFX10-GISEL-NEXT:    s_clause 0x1
190; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
191; GFX10-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x34
192; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
193; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX10-GISEL-NEXT:    s_xor_b32 s3, s3, s4
195; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, s3
196; GFX10-GISEL-NEXT:    s_xor_b32 s2, s4, s2
197; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
198; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
199; GFX10-GISEL-NEXT:    s_endpgm
200entry:
201  %0 = xor i32 %y, %z ; y ^ z
202  %1 = and i32 %x, %0 ; x & (y ^ z)
203  %2 = xor i32 %z, %1 ; z ^ (x & (y ^ z))
204  store i32 %2, ptr addrspace(1) %out
205  ret void
206}
207
; Callable-function form of the SHA-256 Ch pattern: returns
;   %z ^ (%x & (%y ^ %z)).
; All five run configurations are expected to fold the xor/and/xor chain
; into a single v_bfi_b32 v0, v0, v1, v2.
208define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
209; GFX7-LABEL: v_bfi_sha256_ch:
210; GFX7:       ; %bb.0: ; %entry
211; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
213; GFX7-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX8-LABEL: v_bfi_sha256_ch:
216; GFX8:       ; %bb.0: ; %entry
217; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
219; GFX8-NEXT:    s_setpc_b64 s[30:31]
220;
221; GFX10-LABEL: v_bfi_sha256_ch:
222; GFX10:       ; %bb.0: ; %entry
223; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
225; GFX10-NEXT:    s_setpc_b64 s[30:31]
226;
227; GFX8-GISEL-LABEL: v_bfi_sha256_ch:
228; GFX8-GISEL:       ; %bb.0: ; %entry
229; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
231; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
232;
233; GFX10-GISEL-LABEL: v_bfi_sha256_ch:
234; GFX10-GISEL:       ; %bb.0: ; %entry
235; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
237; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
238entry:
239  %0 = xor i32 %y, %z ; y ^ z
240  %1 = and i32 %x, %0 ; x & (y ^ z)
241  %2 = xor i32 %z, %1 ; z ^ (x & (y ^ z))
242  ret i32 %2
243}
244
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), as a pixel shader with %x as a plain
; argument and %y, %z marked inreg (they show up as s0 and s1 in the expected
; output, %x as v0). The result is bitcast to float for the return.
; Expected output: the tahiti and tonga runs first copy s0 into v1 and then
; issue v_bfi_b32 with one scalar operand; the gfx1031 runs use both scalars
; directly in v_bfi_b32.
; NOTE(review): the extra v_mov on the older targets is presumably due to a
; limit on scalar operands per VALU instruction - confirm against the ISA docs.
245define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
246; GFX7-LABEL: v_s_s_bfi_sha256_ch:
247; GFX7:       ; %bb.0: ; %entry
248; GFX7-NEXT:    v_mov_b32_e32 v1, s0
249; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, s1
250; GFX7-NEXT:    ; return to shader part epilog
251;
252; GFX8-LABEL: v_s_s_bfi_sha256_ch:
253; GFX8:       ; %bb.0: ; %entry
254; GFX8-NEXT:    v_mov_b32_e32 v1, s0
255; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, s1
256; GFX8-NEXT:    ; return to shader part epilog
257;
258; GFX10-LABEL: v_s_s_bfi_sha256_ch:
259; GFX10:       ; %bb.0: ; %entry
260; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s1
261; GFX10-NEXT:    ; return to shader part epilog
262;
263; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ch:
264; GFX8-GISEL:       ; %bb.0: ; %entry
265; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s0
266; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, s1
267; GFX8-GISEL-NEXT:    ; return to shader part epilog
268;
269; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ch:
270; GFX10-GISEL:       ; %bb.0: ; %entry
271; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, s1
272; GFX10-GISEL-NEXT:    ; return to shader part epilog
273entry:
274  %xor0 = xor i32 %y, %z ; y ^ z
275  %and = and i32 %x, %xor0 ; x & (y ^ z)
276  %xor1 = xor i32 %z, %and ; z ^ (x & (y ^ z))
277  %cast = bitcast i32 %xor1 to float
278  ret float %cast
279}
280
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), as a pixel shader with %x and %z
; marked inreg (s0 and s1 in the expected output) and %y a plain argument (v0).
; The result is bitcast to float for the return.
; Expected output: on the tahiti/tonga SelectionDAG runs s0 (x) is copied to
; v1 before v_bfi_b32; on the tonga GlobalISel run s1 (z) is copied instead;
; the gfx1031 runs use both scalars directly in v_bfi_b32.
281define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
282; GFX7-LABEL: s_v_s_bfi_sha256_ch:
283; GFX7:       ; %bb.0: ; %entry
284; GFX7-NEXT:    v_mov_b32_e32 v1, s0
285; GFX7-NEXT:    v_bfi_b32 v0, v1, v0, s1
286; GFX7-NEXT:    ; return to shader part epilog
287;
288; GFX8-LABEL: s_v_s_bfi_sha256_ch:
289; GFX8:       ; %bb.0: ; %entry
290; GFX8-NEXT:    v_mov_b32_e32 v1, s0
291; GFX8-NEXT:    v_bfi_b32 v0, v1, v0, s1
292; GFX8-NEXT:    ; return to shader part epilog
293;
294; GFX10-LABEL: s_v_s_bfi_sha256_ch:
295; GFX10:       ; %bb.0: ; %entry
296; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s1
297; GFX10-NEXT:    ; return to shader part epilog
298;
299; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ch:
300; GFX8-GISEL:       ; %bb.0: ; %entry
301; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
302; GFX8-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
303; GFX8-GISEL-NEXT:    ; return to shader part epilog
304;
305; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ch:
306; GFX10-GISEL:       ; %bb.0: ; %entry
307; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, s1
308; GFX10-GISEL-NEXT:    ; return to shader part epilog
309entry:
310  %xor0 = xor i32 %y, %z ; y ^ z
311  %and = and i32 %x, %xor0 ; x & (y ^ z)
312  %xor1 = xor i32 %z, %and ; z ^ (x & (y ^ z))
313  %cast = bitcast i32 %xor1 to float
314  ret float %cast
315}
316
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), as a pixel shader with %x and %y
; marked inreg (s0 and s1 in the expected output) and %z a plain argument (v0).
; The result is bitcast to float for the return.
; Expected output: the tahiti and tonga runs (both selectors) copy s0 (x) into
; v1 before v_bfi_b32; the gfx1031 runs use both scalars directly.
317define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
318; GFX7-LABEL: s_s_v_bfi_sha256_ch:
319; GFX7:       ; %bb.0: ; %entry
320; GFX7-NEXT:    v_mov_b32_e32 v1, s0
321; GFX7-NEXT:    v_bfi_b32 v0, v1, s1, v0
322; GFX7-NEXT:    ; return to shader part epilog
323;
324; GFX8-LABEL: s_s_v_bfi_sha256_ch:
325; GFX8:       ; %bb.0: ; %entry
326; GFX8-NEXT:    v_mov_b32_e32 v1, s0
327; GFX8-NEXT:    v_bfi_b32 v0, v1, s1, v0
328; GFX8-NEXT:    ; return to shader part epilog
329;
330; GFX10-LABEL: s_s_v_bfi_sha256_ch:
331; GFX10:       ; %bb.0: ; %entry
332; GFX10-NEXT:    v_bfi_b32 v0, s0, s1, v0
333; GFX10-NEXT:    ; return to shader part epilog
334;
335; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
336; GFX8-GISEL:       ; %bb.0: ; %entry
337; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s0
338; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v1, s1, v0
339; GFX8-GISEL-NEXT:    ; return to shader part epilog
340;
341; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ch:
342; GFX10-GISEL:       ; %bb.0: ; %entry
343; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, s1, v0
344; GFX10-GISEL-NEXT:    ; return to shader part epilog
345entry:
346  %xor0 = xor i32 %y, %z ; y ^ z
347  %and = and i32 %x, %xor0 ; x & (y ^ z)
348  %xor1 = xor i32 %z, %and ; z ^ (x & (y ^ z))
349  %cast = bitcast i32 %xor1 to float
350  ret float %cast
351}
352
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), as a pixel shader with only %x marked
; inreg (s0 in the expected output); %y and %z arrive in v0 and v1.
; The result is bitcast to float for the return.
; With a single scalar operand, every run configuration is expected to emit
; just v_bfi_b32 v0, s0, v0, v1 with no extra copy.
353define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
354; GFX7-LABEL: s_v_v_bfi_sha256_ch:
355; GFX7:       ; %bb.0: ; %entry
356; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
357; GFX7-NEXT:    ; return to shader part epilog
358;
359; GFX8-LABEL: s_v_v_bfi_sha256_ch:
360; GFX8:       ; %bb.0: ; %entry
361; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
362; GFX8-NEXT:    ; return to shader part epilog
363;
364; GFX10-LABEL: s_v_v_bfi_sha256_ch:
365; GFX10:       ; %bb.0: ; %entry
366; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v1
367; GFX10-NEXT:    ; return to shader part epilog
368;
369; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
370; GFX8-GISEL:       ; %bb.0: ; %entry
371; GFX8-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
372; GFX8-GISEL-NEXT:    ; return to shader part epilog
373;
374; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
375; GFX10-GISEL:       ; %bb.0: ; %entry
376; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, v1
377; GFX10-GISEL-NEXT:    ; return to shader part epilog
378entry:
379  %xor0 = xor i32 %y, %z ; y ^ z
380  %and = and i32 %x, %xor0 ; x & (y ^ z)
381  %xor1 = xor i32 %z, %and ; z ^ (x & (y ^ z))
382  %cast = bitcast i32 %xor1 to float
383  ret float %cast
384}
385
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), as a pixel shader with only %y marked
; inreg (s0 in the expected output); %x and %z arrive in v0 and v1.
; The result is bitcast to float for the return.
; With a single scalar operand, every run configuration is expected to emit
; just v_bfi_b32 v0, v0, s0, v1 with no extra copy.
386define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
387; GFX7-LABEL: v_s_v_bfi_sha256_ch:
388; GFX7:       ; %bb.0: ; %entry
389; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
390; GFX7-NEXT:    ; return to shader part epilog
391;
392; GFX8-LABEL: v_s_v_bfi_sha256_ch:
393; GFX8:       ; %bb.0: ; %entry
394; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
395; GFX8-NEXT:    ; return to shader part epilog
396;
397; GFX10-LABEL: v_s_v_bfi_sha256_ch:
398; GFX10:       ; %bb.0: ; %entry
399; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v1
400; GFX10-NEXT:    ; return to shader part epilog
401;
402; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ch:
403; GFX8-GISEL:       ; %bb.0: ; %entry
404; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v1
405; GFX8-GISEL-NEXT:    ; return to shader part epilog
406;
407; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ch:
408; GFX10-GISEL:       ; %bb.0: ; %entry
409; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v1
410; GFX10-GISEL-NEXT:    ; return to shader part epilog
411entry:
412  %xor0 = xor i32 %y, %z ; y ^ z
413  %and = and i32 %x, %xor0 ; x & (y ^ z)
414  %xor1 = xor i32 %z, %and ; z ^ (x & (y ^ z))
415  %cast = bitcast i32 %xor1 to float
416  ret float %cast
417}
418
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), as a pixel shader with only %z marked
; inreg (s0 in the expected output); %x and %y arrive in v0 and v1.
; The result is bitcast to float for the return.
; With a single scalar operand, every run configuration is expected to emit
; just v_bfi_b32 v0, v0, v1, s0 with no extra copy.
419define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
420; GFX7-LABEL: v_v_s_bfi_sha256_ch:
421; GFX7:       ; %bb.0: ; %entry
422; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, s0
423; GFX7-NEXT:    ; return to shader part epilog
424;
425; GFX8-LABEL: v_v_s_bfi_sha256_ch:
426; GFX8:       ; %bb.0: ; %entry
427; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, s0
428; GFX8-NEXT:    ; return to shader part epilog
429;
430; GFX10-LABEL: v_v_s_bfi_sha256_ch:
431; GFX10:       ; %bb.0: ; %entry
432; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, s0
433; GFX10-NEXT:    ; return to shader part epilog
434;
435; GFX8-GISEL-LABEL: v_v_s_bfi_sha256_ch:
436; GFX8-GISEL:       ; %bb.0: ; %entry
437; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, s0
438; GFX8-GISEL-NEXT:    ; return to shader part epilog
439;
440; GFX10-GISEL-LABEL: v_v_s_bfi_sha256_ch:
441; GFX10-GISEL:       ; %bb.0: ; %entry
442; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, s0
443; GFX10-GISEL-NEXT:    ; return to shader part epilog
444entry:
445  %xor0 = xor i32 %y, %z ; y ^ z
446  %and = and i32 %x, %xor0 ; x & (y ^ z)
447  %xor1 = xor i32 %z, %and ; z ^ (x & (y ^ z))
448  %cast = bitcast i32 %xor1 to float
449  ret float %cast
450}
451
452; SHA-256 Ma function
453; ((x & z) | (y & (x | z)))
; Kernel form of the SHA-256 Ma (majority) pattern: computes
;   (%x & %z) | (%y & (%x | %z))
; from kernel arguments and stores the i32 result through %out.
; Every run configuration is expected to use four scalar and/or instructions;
; no v_bfi_b32 appears in the expected output. The gfx1031 runs list the
; s_or_b32 before the first s_and_b32, the other runs list the s_and_b32 first.
454define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
455; GFX7-LABEL: s_bfi_sha256_ma:
456; GFX7:       ; %bb.0: ; %entry
457; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
458; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
459; GFX7-NEXT:    s_mov_b32 s7, 0xf000
460; GFX7-NEXT:    s_mov_b32 s6, -1
461; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX7-NEXT:    s_and_b32 s3, s0, s2
463; GFX7-NEXT:    s_or_b32 s0, s0, s2
464; GFX7-NEXT:    s_and_b32 s0, s1, s0
465; GFX7-NEXT:    s_or_b32 s0, s3, s0
466; GFX7-NEXT:    v_mov_b32_e32 v0, s0
467; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
468; GFX7-NEXT:    s_endpgm
469;
470; GFX8-LABEL: s_bfi_sha256_ma:
471; GFX8:       ; %bb.0: ; %entry
472; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
473; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
474; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX8-NEXT:    s_and_b32 s3, s0, s2
476; GFX8-NEXT:    s_or_b32 s0, s0, s2
477; GFX8-NEXT:    s_and_b32 s0, s1, s0
478; GFX8-NEXT:    s_or_b32 s0, s3, s0
479; GFX8-NEXT:    v_mov_b32_e32 v0, s4
480; GFX8-NEXT:    v_mov_b32_e32 v1, s5
481; GFX8-NEXT:    v_mov_b32_e32 v2, s0
482; GFX8-NEXT:    flat_store_dword v[0:1], v2
483; GFX8-NEXT:    s_endpgm
484;
485; GFX10-LABEL: s_bfi_sha256_ma:
486; GFX10:       ; %bb.0: ; %entry
487; GFX10-NEXT:    s_clause 0x1
488; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
489; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
490; GFX10-NEXT:    v_mov_b32_e32 v0, 0
491; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX10-NEXT:    s_or_b32 s3, s0, s2
493; GFX10-NEXT:    s_and_b32 s0, s0, s2
494; GFX10-NEXT:    s_and_b32 s1, s1, s3
495; GFX10-NEXT:    s_or_b32 s0, s0, s1
496; GFX10-NEXT:    v_mov_b32_e32 v1, s0
497; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
498; GFX10-NEXT:    s_endpgm
499;
500; GFX8-GISEL-LABEL: s_bfi_sha256_ma:
501; GFX8-GISEL:       ; %bb.0: ; %entry
502; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
503; GFX8-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x34
504; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
506; GFX8-GISEL-NEXT:    s_and_b32 s5, s2, s4
507; GFX8-GISEL-NEXT:    s_or_b32 s2, s2, s4
508; GFX8-GISEL-NEXT:    s_and_b32 s2, s3, s2
509; GFX8-GISEL-NEXT:    s_or_b32 s2, s5, s2
510; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
511; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
512; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
513; GFX8-GISEL-NEXT:    s_endpgm
514;
515; GFX10-GISEL-LABEL: s_bfi_sha256_ma:
516; GFX10-GISEL:       ; %bb.0: ; %entry
517; GFX10-GISEL-NEXT:    s_clause 0x1
518; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
519; GFX10-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x34
520; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
521; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX10-GISEL-NEXT:    s_or_b32 s5, s2, s4
523; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, s4
524; GFX10-GISEL-NEXT:    s_and_b32 s3, s3, s5
525; GFX10-GISEL-NEXT:    s_or_b32 s2, s2, s3
526; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
527; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
528; GFX10-GISEL-NEXT:    s_endpgm
529entry:
530  %0 = and i32 %x, %z ; x & z
531  %1 = or i32 %x, %z ; x | z
532  %2 = and i32 %y, %1 ; y & (x | z)
533  %3 = or i32 %0, %2 ; (x & z) | (y & (x | z))
534  store i32 %3, ptr addrspace(1) %out
535  ret void
536}
537
; Callable-function form of the SHA-256 Ma (majority) pattern: returns
;   (%x & %z) | (%y & (%x | %z)).
; All five run configurations are expected to lower this to two instructions:
; v_xor_b32 of x and y, then v_bfi_b32 using that xor as the first operand
; with z and y as the other two.
538define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
539; GFX7-LABEL: v_bfi_sha256_ma:
540; GFX7:       ; %bb.0: ; %entry
541; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
542; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
543; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v1
544; GFX7-NEXT:    s_setpc_b64 s[30:31]
545;
546; GFX8-LABEL: v_bfi_sha256_ma:
547; GFX8:       ; %bb.0: ; %entry
548; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
549; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
550; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v1
551; GFX8-NEXT:    s_setpc_b64 s[30:31]
552;
553; GFX10-LABEL: v_bfi_sha256_ma:
554; GFX10:       ; %bb.0: ; %entry
555; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
557; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v1
558; GFX10-NEXT:    s_setpc_b64 s[30:31]
559;
560; GFX8-GISEL-LABEL: v_bfi_sha256_ma:
561; GFX8-GISEL:       ; %bb.0: ; %entry
562; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
564; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
565; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
566;
567; GFX10-GISEL-LABEL: v_bfi_sha256_ma:
568; GFX10-GISEL:       ; %bb.0: ; %entry
569; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
571; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
572; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
573entry:
574  %0 = and i32 %x, %z ; x & z
575  %1 = or i32 %x, %z ; x | z
576  %2 = and i32 %y, %1 ; y & (x | z)
577  %3 = or i32 %0, %2 ; (x & z) | (y & (x | z))
578  ret i32 %3
579}
580
; Vector bitselect written in xor/and/xor form: returns
;   ((%a ^ %mask) & %b) ^ %mask
; element-wise on <2 x i32>. Note that in this expression %b is the operand
; that chooses, bit by bit, between %a (where %b is 1) and %mask (where %b is 0).
; All five run configurations are expected to emit one v_bfi_b32 per element,
; with the %b element as the first operand.
581define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
582; GFX7-LABEL: v_bitselect_v2i32_pat1:
583; GFX7:       ; %bb.0:
584; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
586; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
587; GFX7-NEXT:    s_setpc_b64 s[30:31]
588;
589; GFX8-LABEL: v_bitselect_v2i32_pat1:
590; GFX8:       ; %bb.0:
591; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
593; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
594; GFX8-NEXT:    s_setpc_b64 s[30:31]
595;
596; GFX10-LABEL: v_bitselect_v2i32_pat1:
597; GFX10:       ; %bb.0:
598; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
599; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
600; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
601; GFX10-NEXT:    s_setpc_b64 s[30:31]
602;
603; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
604; GFX8-GISEL:       ; %bb.0:
605; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
606; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
607; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
608; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
609;
610; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
611; GFX10-GISEL:       ; %bb.0:
612; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
613; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
614; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
615; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
616  %xor.0 = xor <2 x i32> %a, %mask ; a ^ mask
617  %and = and <2 x i32> %xor.0, %b ; (a ^ mask) & b
618  %bitselect = xor <2 x i32> %and, %mask ; ((a ^ mask) & b) ^ mask
619  ret <2 x i32> %bitselect
620}
621
; 64-bit bitselect in and/andn/or form: returns
;   (%a & %b) | (~%a & %mask).
; Note that in this expression %a is the operand that chooses, bit by bit,
; between %b (where %a is 1) and %mask (where %a is 0).
; All five run configurations are expected to split the i64 into two
; v_bfi_b32, one per 32-bit half. The tahiti and tonga SelectionDAG runs list
; the high half (v1) first; the remaining runs list the low half (v0) first.
622define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
623; GFX7-LABEL: v_bitselect_i64_pat_0:
624; GFX7:       ; %bb.0:
625; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, v5
627; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v4
628; GFX7-NEXT:    s_setpc_b64 s[30:31]
629;
630; GFX8-LABEL: v_bitselect_i64_pat_0:
631; GFX8:       ; %bb.0:
632; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, v5
634; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v4
635; GFX8-NEXT:    s_setpc_b64 s[30:31]
636;
637; GFX10-LABEL: v_bitselect_i64_pat_0:
638; GFX10:       ; %bb.0:
639; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
640; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v4
641; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, v5
642; GFX10-NEXT:    s_setpc_b64 s[30:31]
643;
644; GFX8-GISEL-LABEL: v_bitselect_i64_pat_0:
645; GFX8-GISEL:       ; %bb.0:
646; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
647; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v4
648; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, v5
649; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
650;
651; GFX10-GISEL-LABEL: v_bitselect_i64_pat_0:
652; GFX10-GISEL:       ; %bb.0:
653; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v4
655; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, v5
656; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
657  %and0 = and i64 %a, %b ; a & b
658  %not.a = xor i64 %a, -1 ; ~a
659  %and1 = and i64 %not.a, %mask ; ~a & mask
660  %bitselect = or i64 %and0, %and1 ; (a & b) | (~a & mask)
661  ret i64 %bitselect
662}
663
; 64-bit bitselect, (a & b) | (~a & mask), as a pixel shader with %a a plain
; argument (v[0:1] in the expected output) and %b, %mask marked inreg
; (s[0:1] and s[2:3]). The i64 result is bitcast to <2 x float> for the return.
; Expected output: every run uses one v_bfi_b32 per 32-bit half. The
; tahiti/tonga SelectionDAG runs first copy each mask half (s3, s2) into v2;
; the tonga GlobalISel run copies each %b half (s0, s1) instead; the gfx1031
; runs use both scalar operands directly.
664define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
665; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
666; GFX7:       ; %bb.0:
667; GFX7-NEXT:    v_mov_b32_e32 v2, s3
668; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
669; GFX7-NEXT:    v_mov_b32_e32 v2, s2
670; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
671; GFX7-NEXT:    ; return to shader part epilog
672;
673; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
674; GFX8:       ; %bb.0:
675; GFX8-NEXT:    v_mov_b32_e32 v2, s3
676; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
677; GFX8-NEXT:    v_mov_b32_e32 v2, s2
678; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
679; GFX8-NEXT:    ; return to shader part epilog
680;
681; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
682; GFX10:       ; %bb.0:
683; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
684; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
685; GFX10-NEXT:    ; return to shader part epilog
686;
687; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
688; GFX8-GISEL:       ; %bb.0:
689; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
690; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, s2
691; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s1
692; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v2, s3
693; GFX8-GISEL-NEXT:    ; return to shader part epilog
694;
695; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
696; GFX10-GISEL:       ; %bb.0:
697; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, s2
698; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, s1, s3
699; GFX10-GISEL-NEXT:    ; return to shader part epilog
700  %and0 = and i64 %a, %b ; a & b
701  %not.a = xor i64 %a, -1 ; ~a
702  %and1 = and i64 %not.a, %mask ; ~a & mask
703  %bitselect = or i64 %and0, %and1 ; (a & b) | (~a & mask)
704  %cast = bitcast i64 %bitselect to <2 x float>
705  ret <2 x float> %cast
706}
707
; 64-bit bitselect, (a & b) | (~a & mask), as a pixel shader with %a and %mask
; marked inreg (s[0:1] and s[2:3] in the expected output) and %b a plain
; argument (v[0:1]). The i64 result is bitcast to <2 x float> for the return.
; Expected output differs by selector:
;  - SelectionDAG runs use one v_bfi_b32 per half (tahiti/tonga copy each mask
;    half into v2 first; gfx1031 uses the scalars directly).
;  - GlobalISel runs contain no v_bfi_b32: ~a & mask is computed with one
;    s_andn2_b64 and combined with a & b via v_and_b32 + v_or_b32 on tonga or
;    v_and_or_b32 on gfx1031.
708define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
709; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
710; GFX7:       ; %bb.0:
711; GFX7-NEXT:    v_mov_b32_e32 v2, s3
712; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
713; GFX7-NEXT:    v_mov_b32_e32 v2, s2
714; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
715; GFX7-NEXT:    ; return to shader part epilog
716;
717; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
718; GFX8:       ; %bb.0:
719; GFX8-NEXT:    v_mov_b32_e32 v2, s3
720; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
721; GFX8-NEXT:    v_mov_b32_e32 v2, s2
722; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
723; GFX8-NEXT:    ; return to shader part epilog
724;
725; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
726; GFX10:       ; %bb.0:
727; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
728; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
729; GFX10-NEXT:    ; return to shader part epilog
730;
731; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
732; GFX8-GISEL:       ; %bb.0:
733; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
734; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
735; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
736; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s0, v0
737; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s1, v1
738; GFX8-GISEL-NEXT:    ; return to shader part epilog
739;
740; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
741; GFX10-GISEL:       ; %bb.0:
742; GFX10-GISEL-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
743; GFX10-GISEL-NEXT:    v_and_or_b32 v0, s0, v0, s2
744; GFX10-GISEL-NEXT:    v_and_or_b32 v1, s1, v1, s3
745; GFX10-GISEL-NEXT:    ; return to shader part epilog
746  %and0 = and i64 %a, %b ; a & b
747  %not.a = xor i64 %a, -1 ; ~a
748  %and1 = and i64 %not.a, %mask ; ~a & mask
749  %bitselect = or i64 %and0, %and1 ; (a & b) | (~a & mask)
750  %cast = bitcast i64 %bitselect to <2 x float>
751  ret <2 x float> %cast
752}
753
; 64-bit bitselect, (a & b) | (~a & mask), as a pixel shader with %a and %b
; marked inreg (s[0:1] and s[2:3] in the expected output) and %mask a plain
; argument (v[0:1]). The i64 result is bitcast to <2 x float> for the return.
; Expected output differs by selector:
;  - SelectionDAG runs use one v_bfi_b32 per half (tahiti/tonga copy each %b
;    half into v2 first; gfx1031 uses the scalars directly).
;  - GlobalISel runs contain no v_bfi_b32: a & b and ~a are computed with
;    s_and_b64 and s_not_b64, then merged with the mask via v_and_b32 +
;    v_or_b32 on tonga or v_and_or_b32 on gfx1031.
754define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
755; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
756; GFX7:       ; %bb.0:
757; GFX7-NEXT:    v_mov_b32_e32 v2, s3
758; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
759; GFX7-NEXT:    v_mov_b32_e32 v2, s2
760; GFX7-NEXT:    v_bfi_b32 v0, s0, v2, v0
761; GFX7-NEXT:    ; return to shader part epilog
762;
763; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
764; GFX8:       ; %bb.0:
765; GFX8-NEXT:    v_mov_b32_e32 v2, s3
766; GFX8-NEXT:    v_bfi_b32 v1, s1, v2, v1
767; GFX8-NEXT:    v_mov_b32_e32 v2, s2
768; GFX8-NEXT:    v_bfi_b32 v0, s0, v2, v0
769; GFX8-NEXT:    ; return to shader part epilog
770;
771; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
772; GFX10:       ; %bb.0:
773; GFX10-NEXT:    v_bfi_b32 v0, s0, s2, v0
774; GFX10-NEXT:    v_bfi_b32 v1, s1, s3, v1
775; GFX10-NEXT:    ; return to shader part epilog
776;
777; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
778; GFX8-GISEL:       ; %bb.0:
779; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
780; GFX8-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
781; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
782; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
783; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s2, v0
784; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s3, v1
785; GFX8-GISEL-NEXT:    ; return to shader part epilog
786;
787; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
788; GFX10-GISEL:       ; %bb.0:
789; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
790; GFX10-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
791; GFX10-GISEL-NEXT:    v_and_or_b32 v0, s0, v0, s2
792; GFX10-GISEL-NEXT:    v_and_or_b32 v1, s1, v1, s3
793; GFX10-GISEL-NEXT:    ; return to shader part epilog
794  %and0 = and i64 %a, %b ; a & b
795  %not.a = xor i64 %a, -1 ; ~a
796  %and1 = and i64 %not.a, %mask ; ~a & mask
797  %bitselect = or i64 %and0, %and1 ; (a & b) | (~a & mask)
798  %cast = bitcast i64 %bitselect to <2 x float>
799  ret <2 x float> %cast
800}
801
; 64-bit bitselect, (a & b) | (~a & mask), as a pixel shader with only %mask
; marked inreg (s[0:1] in the expected output); %a and %b arrive in v[0:1]
; and v[2:3]. The i64 result is bitcast to <2 x float> for the return.
; With a single scalar operand, every run configuration is expected to emit
; one v_bfi_b32 per 32-bit half with no extra copies. The tahiti/tonga
; SelectionDAG runs list the high half first; the others list the low half first.
802define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
803; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
804; GFX7:       ; %bb.0:
805; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
806; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
807; GFX7-NEXT:    ; return to shader part epilog
808;
809; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
810; GFX8:       ; %bb.0:
811; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
812; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
813; GFX8-NEXT:    ; return to shader part epilog
814;
815; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
816; GFX10:       ; %bb.0:
817; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
818; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
819; GFX10-NEXT:    ; return to shader part epilog
820;
821; GFX8-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
822; GFX8-GISEL:       ; %bb.0:
823; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, s0
824; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, s1
825; GFX8-GISEL-NEXT:    ; return to shader part epilog
826;
827; GFX10-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
828; GFX10-GISEL:       ; %bb.0:
829; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, s0
830; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, s1
831; GFX10-GISEL-NEXT:    ; return to shader part epilog
832  %and0 = and i64 %a, %b ; a & b
833  %not.a = xor i64 %a, -1 ; ~a
834  %and1 = and i64 %not.a, %mask ; ~a & mask
835  %bitselect = or i64 %and0, %and1 ; (a & b) | (~a & mask)
836  %cast = bitcast i64 %bitselect to <2 x float>
837  ret <2 x float> %cast
838}
839
; 64-bit bitselect, pattern 0, shader calling convention
; (a & b) | (~a & mask)
; Only %b is passed inreg. Every run expects one v_bfi_b32 per 32-bit half with
; s0/s1 as the middle source operand.
840define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
841; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
842; GFX7:       ; %bb.0:
843; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v3
844; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
845; GFX7-NEXT:    ; return to shader part epilog
846;
847; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
848; GFX8:       ; %bb.0:
849; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v3
850; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
851; GFX8-NEXT:    ; return to shader part epilog
852;
853; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
854; GFX10:       ; %bb.0:
855; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v2
856; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, v3
857; GFX10-NEXT:    ; return to shader part epilog
858;
859; GFX8-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
860; GFX8-GISEL:       ; %bb.0:
861; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v2
862; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, s1, v3
863; GFX8-GISEL-NEXT:    ; return to shader part epilog
864;
865; GFX10-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
866; GFX10-GISEL:       ; %bb.0:
867; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s0, v2
868; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, s1, v3
869; GFX10-GISEL-NEXT:    ; return to shader part epilog
870  %and0 = and i64 %a, %b
871  %not.a = xor i64 %a, -1 ; bitwise NOT of %a
872  %and1 = and i64 %not.a, %mask
873  %bitselect = or i64 %and0, %and1
874  %cast = bitcast i64 %bitselect to <2 x float> ; reinterpret the i64 for the <2 x float> return
875  ret <2 x float> %cast
876}
877
; 64-bit bitselect, pattern 0, shader calling convention
; (a & b) | (~a & mask)
; Only %a (the selector) is passed inreg. The runs without -global-isel expect
; one v_bfi_b32 per half with s0/s1 as the first operand. The -global-isel runs
; currently expect no v_bfi_b32: tonga gets and / s_not_b64 / and / or, and
; gfx1031 gets s_not_b64 + v_and_b32 + v_and_or_b32.
878define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
879; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
880; GFX7:       ; %bb.0:
881; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v3
882; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
883; GFX7-NEXT:    ; return to shader part epilog
884;
885; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
886; GFX8:       ; %bb.0:
887; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v3
888; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
889; GFX8-NEXT:    ; return to shader part epilog
890;
891; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
892; GFX10:       ; %bb.0:
893; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v2
894; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, v3
895; GFX10-NEXT:    ; return to shader part epilog
896;
897; GFX8-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
898; GFX8-GISEL:       ; %bb.0:
899; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
900; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
901; GFX8-GISEL-NEXT:    s_not_b64 s[0:1], s[0:1]
902; GFX8-GISEL-NEXT:    v_and_b32_e32 v2, s0, v2
903; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, s1, v3
904; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
905; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
906; GFX8-GISEL-NEXT:    ; return to shader part epilog
907;
908; GFX10-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
909; GFX10-GISEL:       ; %bb.0:
910; GFX10-GISEL-NEXT:    s_not_b64 s[2:3], s[0:1]
911; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, s2, v2
912; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, s3, v3
913; GFX10-GISEL-NEXT:    v_and_or_b32 v0, s0, v0, v2
914; GFX10-GISEL-NEXT:    v_and_or_b32 v1, s1, v1, v3
915; GFX10-GISEL-NEXT:    ; return to shader part epilog
916  %and0 = and i64 %a, %b
917  %not.a = xor i64 %a, -1 ; bitwise NOT of %a
918  %and1 = and i64 %not.a, %mask
919  %bitselect = or i64 %and0, %and1
920  %cast = bitcast i64 %bitselect to <2 x float> ; reinterpret the i64 for the <2 x float> return
921  ret <2 x float> %cast
922}
923
; 64-bit bitselect, pattern 1, default calling convention (returns i64)
; ((a ^ mask) & b) ^ mask
; No argument is inreg. Every run expects one v_bfi_b32 per 32-bit half, with
; the registers holding %b (v2/v3) as the first operand, bracketed by the
; entry s_waitcnt and the s_setpc_b64 return.
924define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
925; GFX7-LABEL: v_bitselect_i64_pat_1:
926; GFX7:       ; %bb.0:
927; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
929; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
930; GFX7-NEXT:    s_setpc_b64 s[30:31]
931;
932; GFX8-LABEL: v_bitselect_i64_pat_1:
933; GFX8:       ; %bb.0:
934; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
935; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
936; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
937; GFX8-NEXT:    s_setpc_b64 s[30:31]
938;
939; GFX10-LABEL: v_bitselect_i64_pat_1:
940; GFX10:       ; %bb.0:
941; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
942; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
943; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
944; GFX10-NEXT:    s_setpc_b64 s[30:31]
945;
946; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
947; GFX8-GISEL:       ; %bb.0:
948; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
949; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
950; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
951; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
952;
953; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
954; GFX10-GISEL:       ; %bb.0:
955; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
957; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
958; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
959  %xor.0 = xor i64 %a, %mask
960  %and = and i64 %xor.0, %b
961  %bitselect = xor i64 %and, %mask
962  ret i64 %bitselect
963}
964
; 64-bit bitselect, pattern 1, shader calling convention
; ((a ^ mask) & b) ^ mask
; %b and %mask are passed inreg. The tahiti and tonga runs expect one scalar
; operand to be copied into v2 with v_mov_b32 before each v_bfi_b32; the
; gfx1031 runs expect both scalar operands to be used by v_bfi_b32 directly.
965define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
966; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
967; GFX7:       ; %bb.0:
968; GFX7-NEXT:    v_mov_b32_e32 v2, s3
969; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
970; GFX7-NEXT:    v_mov_b32_e32 v2, s2
971; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
972; GFX7-NEXT:    ; return to shader part epilog
973;
974; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
975; GFX8:       ; %bb.0:
976; GFX8-NEXT:    v_mov_b32_e32 v2, s3
977; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
978; GFX8-NEXT:    v_mov_b32_e32 v2, s2
979; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
980; GFX8-NEXT:    ; return to shader part epilog
981;
982; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
983; GFX10:       ; %bb.0:
984; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
985; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
986; GFX10-NEXT:    ; return to shader part epilog
987;
988; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
989; GFX8-GISEL:       ; %bb.0:
990; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
991; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, s2
992; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s1
993; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v2, v1, s3
994; GFX8-GISEL-NEXT:    ; return to shader part epilog
995;
996; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
997; GFX10-GISEL:       ; %bb.0:
998; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s0, v0, s2
999; GFX10-GISEL-NEXT:    v_bfi_b32 v1, s1, v1, s3
1000; GFX10-GISEL-NEXT:    ; return to shader part epilog
1001  %xor.0 = xor i64 %a, %mask
1002  %and = and i64 %xor.0, %b
1003  %bitselect = xor i64 %and, %mask
1004  %cast = bitcast i64 %bitselect to <2 x float> ; reinterpret the i64 for the <2 x float> return
1005  ret <2 x float> %cast
1006}
1007
; 64-bit bitselect, pattern 1, shader calling convention
; ((a ^ mask) & b) ^ mask
; %a and %b are passed inreg. The tahiti and tonga runs expect one scalar
; operand to be copied into v2 with v_mov_b32 before each v_bfi_b32; the
; gfx1031 runs expect both scalar operands to be used by v_bfi_b32 directly.
1008define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
1009; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
1010; GFX7:       ; %bb.0:
1011; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1012; GFX7-NEXT:    v_bfi_b32 v1, s3, v2, v1
1013; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1014; GFX7-NEXT:    v_bfi_b32 v0, s2, v2, v0
1015; GFX7-NEXT:    ; return to shader part epilog
1016;
1017; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
1018; GFX8:       ; %bb.0:
1019; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1020; GFX8-NEXT:    v_bfi_b32 v1, s3, v2, v1
1021; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1022; GFX8-NEXT:    v_bfi_b32 v0, s2, v2, v0
1023; GFX8-NEXT:    ; return to shader part epilog
1024;
1025; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
1026; GFX10:       ; %bb.0:
1027; GFX10-NEXT:    v_bfi_b32 v0, s2, s0, v0
1028; GFX10-NEXT:    v_bfi_b32 v1, s3, s1, v1
1029; GFX10-NEXT:    ; return to shader part epilog
1030;
1031; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1032; GFX8-GISEL:       ; %bb.0:
1033; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
1034; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, s0, v0
1035; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1036; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v2, s1, v1
1037; GFX8-GISEL-NEXT:    ; return to shader part epilog
1038;
1039; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1040; GFX10-GISEL:       ; %bb.0:
1041; GFX10-GISEL-NEXT:    v_bfi_b32 v0, s2, s0, v0
1042; GFX10-GISEL-NEXT:    v_bfi_b32 v1, s3, s1, v1
1043; GFX10-GISEL-NEXT:    ; return to shader part epilog
1044  %xor.0 = xor i64 %a, %mask
1045  %and = and i64 %xor.0, %b
1046  %bitselect = xor i64 %and, %mask
1047  %cast = bitcast i64 %bitselect to <2 x float> ; reinterpret the i64 for the <2 x float> return
1048  ret <2 x float> %cast
1049}
1050
; 64-bit bitselect, pattern 1, shader calling convention
; ((a ^ mask) & b) ^ mask
; %a and %mask are passed inreg. The runs without -global-isel expect v_bfi_b32
; (with a v_mov_b32 copy of one scalar operand on tahiti and tonga). The
; -global-isel runs currently expect no v_bfi_b32: the first xor is done as
; s_xor_b64 and the remaining and / xor are emitted as separate VALU ops.
1051define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
1052; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
1053; GFX7:       ; %bb.0:
1054; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1055; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
1056; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1057; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
1058; GFX7-NEXT:    ; return to shader part epilog
1059;
1060; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
1061; GFX8:       ; %bb.0:
1062; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1063; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
1064; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1065; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
1066; GFX8-NEXT:    ; return to shader part epilog
1067;
1068; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
1069; GFX10:       ; %bb.0:
1070; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
1071; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
1072; GFX10-NEXT:    ; return to shader part epilog
1073;
1074; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1075; GFX8-GISEL:       ; %bb.0:
1076; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
1077; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1078; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1079; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1080; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1081; GFX8-GISEL-NEXT:    ; return to shader part epilog
1082;
1083; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1084; GFX10-GISEL:       ; %bb.0:
1085; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
1086; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1087; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1088; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s2, v0
1089; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s3, v1
1090; GFX10-GISEL-NEXT:    ; return to shader part epilog
1091  %xor.0 = xor i64 %a, %mask
1092  %and = and i64 %xor.0, %b
1093  %bitselect = xor i64 %and, %mask
1094  %cast = bitcast i64 %bitselect to <2 x float> ; reinterpret the i64 for the <2 x float> return
1095  ret <2 x float> %cast
1096}
1097
; 64-bit bitselect, "pattern 2", default calling convention (returns i64)
; ((a ^ mask) & b) ^ mask
; Every run expects one v_bfi_b32 per 32-bit half.
; NOTE(review): the body below is the same xor / and / xor sequence as
; @v_bitselect_i64_pat_1 in this file, so this currently duplicates that test.
; Confirm whether a different operand arrangement was intended here.
1098define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1099; GFX7-LABEL: v_bitselect_i64_pat_2:
1100; GFX7:       ; %bb.0:
1101; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
1103; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
1104; GFX7-NEXT:    s_setpc_b64 s[30:31]
1105;
1106; GFX8-LABEL: v_bitselect_i64_pat_2:
1107; GFX8:       ; %bb.0:
1108; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1109; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
1110; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
1111; GFX8-NEXT:    s_setpc_b64 s[30:31]
1112;
1113; GFX10-LABEL: v_bitselect_i64_pat_2:
1114; GFX10:       ; %bb.0:
1115; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1116; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
1117; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
1118; GFX10-NEXT:    s_setpc_b64 s[30:31]
1119;
1120; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
1121; GFX8-GISEL:       ; %bb.0:
1122; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
1124; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
1125; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1126;
1127; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
1128; GFX10-GISEL:       ; %bb.0:
1129; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1130; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
1131; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
1132; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1133  %xor.0 = xor i64 %a, %mask
1134  %and = and i64 %xor.0, %b
1135  %bitselect = xor i64 %and, %mask
1136  ret i64 %bitselect
1137}
1138
; 64-bit bitwise majority of three values (the name suggests the SHA-256 "Ma"
; function), default calling convention
; (x & z) | (y & (x | z))
; Every run expects, per 32-bit half, a v_xor_b32 of two of the inputs whose
; result is then used as the first operand of a v_bfi_b32.
1139define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1140; GFX7-LABEL: v_bfi_sha256_ma_i64:
1141; GFX7:       ; %bb.0: ; %entry
1142; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1143; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
1144; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
1145; GFX7-NEXT:    v_bfi_b32 v1, v1, v5, v3
1146; GFX7-NEXT:    v_bfi_b32 v0, v0, v4, v2
1147; GFX7-NEXT:    s_setpc_b64 s[30:31]
1148;
1149; GFX8-LABEL: v_bfi_sha256_ma_i64:
1150; GFX8:       ; %bb.0: ; %entry
1151; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1152; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
1153; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
1154; GFX8-NEXT:    v_bfi_b32 v1, v1, v5, v3
1155; GFX8-NEXT:    v_bfi_b32 v0, v0, v4, v2
1156; GFX8-NEXT:    s_setpc_b64 s[30:31]
1157;
1158; GFX10-LABEL: v_bfi_sha256_ma_i64:
1159; GFX10:       ; %bb.0: ; %entry
1160; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1161; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
1162; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
1163; GFX10-NEXT:    v_bfi_b32 v0, v0, v4, v2
1164; GFX10-NEXT:    v_bfi_b32 v1, v1, v5, v3
1165; GFX10-NEXT:    s_setpc_b64 s[30:31]
1166;
1167; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
1168; GFX8-GISEL:       ; %bb.0: ; %entry
1169; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1170; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
1171; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
1172; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v4, v2
1173; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v5, v3
1174; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
1175;
1176; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
1177; GFX10-GISEL:       ; %bb.0: ; %entry
1178; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1179; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
1180; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
1181; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v4, v2
1182; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, v5, v3
1183; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
1184entry:
1185  %and0 = and i64 %x, %z
1186  %or0 = or i64 %x, %z
1187  %and1 = and i64 %y, %or0
1188  %or1 = or i64 %and0, %and1
1189  ret i64 %or1
1190}
1191
; 64-bit bitwise majority, shader calling convention
; (x & z) | (y & (x | z))
; %y and %z are passed inreg. Every run expects v_xor_b32 + v_bfi_b32 per half.
; The tahiti and tonga runs additionally expect a v_mov_b32 copying one scalar
; operand into v2 before each v_bfi_b32; the gfx1031 runs use both scalar
; operands in v_bfi_b32 directly.
1192define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
1193; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
1194; GFX7:       ; %bb.0: ; %entry
1195; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
1196; GFX7-NEXT:    v_mov_b32_e32 v2, s1
1197; GFX7-NEXT:    v_bfi_b32 v1, v1, s3, v2
1198; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
1199; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1200; GFX7-NEXT:    v_bfi_b32 v0, v0, s2, v2
1201; GFX7-NEXT:    ; return to shader part epilog
1202;
1203; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
1204; GFX8:       ; %bb.0: ; %entry
1205; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1206; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1207; GFX8-NEXT:    v_bfi_b32 v1, v1, s3, v2
1208; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1209; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1210; GFX8-NEXT:    v_bfi_b32 v0, v0, s2, v2
1211; GFX8-NEXT:    ; return to shader part epilog
1212;
1213; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
1214; GFX10:       ; %bb.0: ; %entry
1215; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
1216; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
1217; GFX10-NEXT:    v_bfi_b32 v0, v0, s2, s0
1218; GFX10-NEXT:    v_bfi_b32 v1, v1, s3, s1
1219; GFX10-NEXT:    ; return to shader part epilog
1220;
1221; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1222; GFX8-GISEL:       ; %bb.0: ; %entry
1223; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s2
1224; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
1225; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, s0
1226; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1227; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s1, v1
1228; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v2, s1
1229; GFX8-GISEL-NEXT:    ; return to shader part epilog
1230;
1231; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1232; GFX10-GISEL:       ; %bb.0: ; %entry
1233; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
1234; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s1, v1
1235; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, s2, s0
1236; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, s3, s1
1237; GFX10-GISEL-NEXT:    ; return to shader part epilog
1238entry:
1239  %and0 = and i64 %x, %z
1240  %or0 = or i64 %x, %z
1241  %and1 = and i64 %y, %or0
1242  %or1 = or i64 %and0, %and1
1243  %cast = bitcast i64 %or1 to <2 x float> ; reinterpret the i64 for the <2 x float> return
1244  ret <2 x float> %cast
1245}
1246
; 64-bit bitwise majority, shader calling convention
; (x & z) | (y & (x | z))
; %x and %z are passed inreg. The runs without -global-isel expect v_xor_b32 +
; v_bfi_b32 per half. The -global-isel runs currently expect no v_bfi_b32: the
; and / or of the two scalar inputs are done as s_and_b64 / s_or_b64 and then
; combined with %y using v_and_b32 + v_or_b32 (tonga) or v_and_or_b32 (gfx1031).
1247define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
1248; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
1249; GFX7:       ; %bb.0: ; %entry
1250; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v1
1251; GFX7-NEXT:    v_bfi_b32 v1, v2, s3, v1
1252; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v0
1253; GFX7-NEXT:    v_bfi_b32 v0, v2, s2, v0
1254; GFX7-NEXT:    ; return to shader part epilog
1255;
1256; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
1257; GFX8:       ; %bb.0: ; %entry
1258; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v1
1259; GFX8-NEXT:    v_bfi_b32 v1, v2, s3, v1
1260; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v0
1261; GFX8-NEXT:    v_bfi_b32 v0, v2, s2, v0
1262; GFX8-NEXT:    ; return to shader part epilog
1263;
1264; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
1265; GFX10:       ; %bb.0: ; %entry
1266; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v0
1267; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v1
1268; GFX10-NEXT:    v_bfi_b32 v0, v2, s2, v0
1269; GFX10-NEXT:    v_bfi_b32 v1, v3, s3, v1
1270; GFX10-NEXT:    ; return to shader part epilog
1271;
1272; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1273; GFX8-GISEL:       ; %bb.0: ; %entry
1274; GFX8-GISEL-NEXT:    s_and_b64 s[4:5], s[0:1], s[2:3]
1275; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1276; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, s0, v0
1277; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, s1, v1
1278; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, s4, v0
1279; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, s5, v1
1280; GFX8-GISEL-NEXT:    ; return to shader part epilog
1281;
1282; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1283; GFX10-GISEL:       ; %bb.0: ; %entry
1284; GFX10-GISEL-NEXT:    s_and_b64 s[4:5], s[0:1], s[2:3]
1285; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1286; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, s0, s4
1287; GFX10-GISEL-NEXT:    v_and_or_b32 v1, v1, s1, s5
1288; GFX10-GISEL-NEXT:    ; return to shader part epilog
1289entry:
1290  %and0 = and i64 %x, %z
1291  %or0 = or i64 %x, %z
1292  %and1 = and i64 %y, %or0
1293  %or1 = or i64 %and0, %and1
1294  %cast = bitcast i64 %or1 to <2 x float> ; reinterpret the i64 for the <2 x float> return
1295  ret <2 x float> %cast
1296}
1297
; 64-bit bitwise majority, shader calling convention
; (x & z) | (y & (x | z))
; %x and %y are passed inreg. Every run expects an xor of the two scalar inputs
; feeding a v_bfi_b32 per half. The tahiti and tonga runs expect a v_mov_b32
; copy followed by v_xor_b32_e32; the gfx1031 runs expect v_xor_b32_e64 taking
; both scalar registers directly.
1298define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
1299; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
1300; GFX7:       ; %bb.0: ; %entry
1301; GFX7-NEXT:    v_mov_b32_e32 v2, s3
1302; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v2
1303; GFX7-NEXT:    v_bfi_b32 v1, v2, v1, s3
1304; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1305; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v2
1306; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, s2
1307; GFX7-NEXT:    ; return to shader part epilog
1308;
1309; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
1310; GFX8:       ; %bb.0: ; %entry
1311; GFX8-NEXT:    v_mov_b32_e32 v2, s3
1312; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v2
1313; GFX8-NEXT:    v_bfi_b32 v1, v2, v1, s3
1314; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1315; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
1316; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, s2
1317; GFX8-NEXT:    ; return to shader part epilog
1318;
1319; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
1320; GFX10:       ; %bb.0: ; %entry
1321; GFX10-NEXT:    v_xor_b32_e64 v2, s0, s2
1322; GFX10-NEXT:    v_xor_b32_e64 v3, s1, s3
1323; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, s2
1324; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, s3
1325; GFX10-NEXT:    ; return to shader part epilog
1326;
1327; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1328; GFX8-GISEL:       ; %bb.0: ; %entry
1329; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s0
1330; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, s2, v2
1331; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, s2
1332; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, s1
1333; GFX8-GISEL-NEXT:    v_xor_b32_e32 v2, s3, v2
1334; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v2, v1, s3
1335; GFX8-GISEL-NEXT:    ; return to shader part epilog
1336;
1337; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1338; GFX10-GISEL:       ; %bb.0: ; %entry
1339; GFX10-GISEL-NEXT:    v_xor_b32_e64 v2, s0, s2
1340; GFX10-GISEL-NEXT:    v_xor_b32_e64 v3, s1, s3
1341; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, s2
1342; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, s3
1343; GFX10-GISEL-NEXT:    ; return to shader part epilog
1344entry:
1345  %and0 = and i64 %x, %z
1346  %or0 = or i64 %x, %z
1347  %and1 = and i64 %y, %or0
1348  %or1 = or i64 %and0, %and1
1349  %cast = bitcast i64 %or1 to <2 x float> ; reinterpret the i64 for the <2 x float> return
1350  ret <2 x float> %cast
1351}
1352
; 64-bit bitwise majority, shader calling convention
; (x & z) | (y & (x | z))
; Only %y is passed inreg. Every run expects, per 32-bit half, a v_xor_b32 with
; s0/s1 followed by a v_bfi_b32 that reads s0/s1 again as its last operand.
1353define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
1354; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
1355; GFX7:       ; %bb.0: ; %entry
1356; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
1357; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
1358; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
1359; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
1360; GFX7-NEXT:    ; return to shader part epilog
1361;
1362; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
1363; GFX8:       ; %bb.0: ; %entry
1364; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1365; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1366; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
1367; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
1368; GFX8-NEXT:    ; return to shader part epilog
1369;
1370; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
1371; GFX10:       ; %bb.0: ; %entry
1372; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
1373; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
1374; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
1375; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
1376; GFX10-NEXT:    ; return to shader part epilog
1377;
1378; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1379; GFX8-GISEL:       ; %bb.0: ; %entry
1380; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
1381; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, s1, v1
1382; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, s0
1383; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, s1
1384; GFX8-GISEL-NEXT:    ; return to shader part epilog
1385;
1386; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1387; GFX10-GISEL:       ; %bb.0: ; %entry
1388; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, s0, v0
1389; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, s1, v1
1390; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, s0
1391; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, s1
1392; GFX10-GISEL-NEXT:    ; return to shader part epilog
1393entry:
1394  %and0 = and i64 %x, %z
1395  %or0 = or i64 %x, %z
1396  %and1 = and i64 %y, %or0
1397  %or1 = or i64 %and0, %and1
1398  %cast = bitcast i64 %or1 to <2 x float> ; reinterpret the i64 for the <2 x float> return
1399  ret <2 x float> %cast
1400}
1401
; 64-bit bitselect, pattern 0, on kernel arguments with a scalar user
; (a & b) | (~a & mask)
; The bitselect result feeds an i64 add of 10 (%scalar.use). Every run expects
; the whole computation on scalar instructions (s_and_b64 / s_andn2_b64 /
; s_or_b64, then s_add_u32 / s_addc_u32) and no v_bfi_b32; the value is only
; copied to VGPRs for the final store.
; The store address is a don't-care sink, so it uses poison rather than the
; deprecated undef.
1402define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
1403; GFX7-LABEL: s_bitselect_i64_pat_0:
1404; GFX7:       ; %bb.0:
1405; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1406; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1407; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1408; GFX7-NEXT:    s_mov_b32 s6, -1
1409; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1410; GFX7-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
1411; GFX7-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
1412; GFX7-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1413; GFX7-NEXT:    s_add_u32 s0, s0, 10
1414; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1415; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1416; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1417; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1418; GFX7-NEXT:    s_endpgm
1419;
1420; GFX8-LABEL: s_bitselect_i64_pat_0:
1421; GFX8:       ; %bb.0:
1422; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1423; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1424; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1425; GFX8-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
1426; GFX8-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
1427; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1428; GFX8-NEXT:    s_add_u32 s0, s0, 10
1429; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1430; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1431; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1432; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1433; GFX8-NEXT:    s_endpgm
1434;
1435; GFX10-LABEL: s_bitselect_i64_pat_0:
1436; GFX10:       ; %bb.0:
1437; GFX10-NEXT:    s_clause 0x1
1438; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1439; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1440; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX10-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
1442; GFX10-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
1443; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1444; GFX10-NEXT:    s_add_u32 s0, s0, 10
1445; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1446; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1447; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1448; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1449; GFX10-NEXT:    s_endpgm
1450;
1451; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
1452; GFX8-GISEL:       ; %bb.0:
1453; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1454; GFX8-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1455; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1456; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
1457; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
1458; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1459; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1460; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1461; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1462; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1463; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1464; GFX8-GISEL-NEXT:    s_endpgm
1465;
1466; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0:
1467; GFX10-GISEL:       ; %bb.0:
1468; GFX10-GISEL-NEXT:    s_clause 0x1
1469; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1470; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1471; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
1473; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
1474; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1475; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1476; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1477; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1478; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1479; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1480; GFX10-GISEL-NEXT:    s_endpgm
1481  %and0 = and i64 %a, %b
1482  %not.a = xor i64 %a, -1 ; bitwise NOT of %a
1483  %and1 = and i64 %not.a, %mask
1484  %bitselect = or i64 %and0, %and1
1485  %scalar.use = add i64 %bitselect, 10 ; extra user of the bitselect result, expected as s_add_u32/s_addc_u32
1486  store i64 %scalar.use, ptr addrspace(1) poison
1487  ret void
1488}
1489
; 64-bit bitselect, pattern 1, on kernel arguments with a scalar user
; ((a ^ mask) & b) ^ mask
; The bitselect result feeds an i64 add of 10 (%scalar.use). Every run expects
; the whole computation on scalar instructions (s_xor_b64 / s_and_b64 /
; s_xor_b64, then s_add_u32 / s_addc_u32) and no v_bfi_b32; the value is only
; copied to VGPRs for the final store.
; The store address is a don't-care sink, so it uses poison rather than the
; deprecated undef.
1490define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1491; GFX7-LABEL: s_bitselect_i64_pat_1:
1492; GFX7:       ; %bb.0:
1493; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1494; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1495; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1496; GFX7-NEXT:    s_mov_b32 s6, -1
1497; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1499; GFX7-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1500; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1501; GFX7-NEXT:    s_add_u32 s0, s0, 10
1502; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1503; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1504; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1505; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1506; GFX7-NEXT:    s_endpgm
1507;
1508; GFX8-LABEL: s_bitselect_i64_pat_1:
1509; GFX8:       ; %bb.0:
1510; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1511; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1512; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1513; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1514; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1515; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1516; GFX8-NEXT:    s_add_u32 s0, s0, 10
1517; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1518; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1519; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1520; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1521; GFX8-NEXT:    s_endpgm
1522;
1523; GFX10-LABEL: s_bitselect_i64_pat_1:
1524; GFX10:       ; %bb.0:
1525; GFX10-NEXT:    s_clause 0x1
1526; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1527; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1528; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1529; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1530; GFX10-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1531; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1532; GFX10-NEXT:    s_add_u32 s0, s0, 10
1533; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1534; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1535; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1536; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1537; GFX10-NEXT:    s_endpgm
1538;
1539; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
1540; GFX8-GISEL:       ; %bb.0:
1541; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1542; GFX8-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1543; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1544; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1545; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1546; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1547; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1548; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1549; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1550; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1551; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1552; GFX8-GISEL-NEXT:    s_endpgm
1553;
1554; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1:
1555; GFX10-GISEL:       ; %bb.0:
1556; GFX10-GISEL-NEXT:    s_clause 0x1
1557; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1558; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1559; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1561; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1562; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1563; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1564; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1565; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1566; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1567; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1568; GFX10-GISEL-NEXT:    s_endpgm
1569  %xor.0 = xor i64 %a, %mask
1570  %and = and i64 %xor.0, %b
1571  %bitselect = xor i64 %and, %mask
1572
1573  %scalar.use = add i64 %bitselect, 10 ; extra user of the bitselect result, expected as s_add_u32/s_addc_u32
1574  store i64 %scalar.use, ptr addrspace(1) poison
1575  ret void
1576}
1577
; Scalar 64-bit bitselect test with all three operands passed as kernel
; arguments. The IR computes ((%a ^ %mask) & %b) ^ %mask, which takes each bit
; from %a where %b is set and from %mask where %b is clear.
; For every run line the autogenerated checks below expect the scalar sequence
; s_xor_b64 / s_and_b64 / s_xor_b64; no BFI instruction appears in the expected
; output.
; NOTE(review): the IR body is identical, instruction for instruction, to the
; body of the function immediately above this one in the file - confirm whether
; a different bitselect form was intended here.
1578define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1579; GFX7-LABEL: s_bitselect_i64_pat_2:
1580; GFX7:       ; %bb.0:
1581; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1582; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1583; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1584; GFX7-NEXT:    s_mov_b32 s6, -1
1585; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1586; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1587; GFX7-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1588; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1589; GFX7-NEXT:    s_add_u32 s0, s0, 10
1590; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1591; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1592; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1593; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1594; GFX7-NEXT:    s_endpgm
1595;
1596; GFX8-LABEL: s_bitselect_i64_pat_2:
1597; GFX8:       ; %bb.0:
1598; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1599; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1602; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1603; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1604; GFX8-NEXT:    s_add_u32 s0, s0, 10
1605; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1606; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1607; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1608; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1609; GFX8-NEXT:    s_endpgm
1610;
1611; GFX10-LABEL: s_bitselect_i64_pat_2:
1612; GFX10:       ; %bb.0:
1613; GFX10-NEXT:    s_clause 0x1
1614; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1615; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1616; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1617; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1618; GFX10-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1619; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1620; GFX10-NEXT:    s_add_u32 s0, s0, 10
1621; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1622; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1623; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1624; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1625; GFX10-NEXT:    s_endpgm
1626;
1627; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
1628; GFX8-GISEL:       ; %bb.0:
1629; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1630; GFX8-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1631; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1632; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1633; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1634; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1635; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1636; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1637; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1638; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1639; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1640; GFX8-GISEL-NEXT:    s_endpgm
1641;
1642; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2:
1643; GFX10-GISEL:       ; %bb.0:
1644; GFX10-GISEL-NEXT:    s_clause 0x1
1645; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1646; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1647; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1648; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1649; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1650; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
1651; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1652; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1653; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1654; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1655; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1656; GFX10-GISEL-NEXT:    s_endpgm
1657  %xor.0 = xor i64 %a, %mask
1658  %and = and i64 %xor.0, %b
1659  %bitselect = xor i64 %and, %mask
1660
  ; The value name suggests this add exists to give the bitselect result a
  ; scalar user; the checks show it selected as s_add_u32 + s_addc_u32.
1661  %scalar.use = add i64 %bitselect, 10
  ; The destination pointer is undef; the test only checks the emitted
  ; instruction sequence.
1662  store i64 %scalar.use, ptr addrspace(1) undef
1663  ret void
1664}
1665
; Scalar 64-bit variant of the pattern the function name calls SHA-256 "Ma",
; with all three operands passed as kernel arguments. The IR computes
; (%x & %z) | (%y & (%x | %z)), i.e. the bitwise majority of %x, %y and %z.
; For every run line the autogenerated checks below expect two s_and_b64 and
; two s_or_b64 instructions and no BFI instruction. The tahiti and tonga runs
; emit the first AND before the first OR, while the gfx1031 runs emit the OR
; first.
1666define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1667; GFX7-LABEL: s_bfi_sha256_ma_i64:
1668; GFX7:       ; %bb.0: ; %entry
1669; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1670; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1671; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1672; GFX7-NEXT:    s_mov_b32 s6, -1
1673; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1674; GFX7-NEXT:    s_and_b64 s[8:9], s[0:1], s[4:5]
1675; GFX7-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
1676; GFX7-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
1677; GFX7-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1678; GFX7-NEXT:    s_add_u32 s0, s0, 10
1679; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1680; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1681; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1682; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1683; GFX7-NEXT:    s_endpgm
1684;
1685; GFX8-LABEL: s_bfi_sha256_ma_i64:
1686; GFX8:       ; %bb.0: ; %entry
1687; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1688; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1689; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1690; GFX8-NEXT:    s_and_b64 s[6:7], s[0:1], s[4:5]
1691; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
1692; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
1693; GFX8-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
1694; GFX8-NEXT:    s_add_u32 s0, s0, 10
1695; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1696; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1697; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1698; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1699; GFX8-NEXT:    s_endpgm
1700;
1701; GFX10-LABEL: s_bfi_sha256_ma_i64:
1702; GFX10:       ; %bb.0: ; %entry
1703; GFX10-NEXT:    s_clause 0x1
1704; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1705; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1706; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1707; GFX10-NEXT:    s_or_b64 s[6:7], s[0:1], s[4:5]
1708; GFX10-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
1709; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1710; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1711; GFX10-NEXT:    s_add_u32 s0, s0, 10
1712; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1713; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1714; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1715; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1716; GFX10-NEXT:    s_endpgm
1717;
1718; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
1719; GFX8-GISEL:       ; %bb.0: ; %entry
1720; GFX8-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1721; GFX8-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1722; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1723; GFX8-GISEL-NEXT:    s_and_b64 s[6:7], s[0:1], s[4:5]
1724; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
1725; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
1726; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
1727; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
1728; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1729; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1730; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1731; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1732; GFX8-GISEL-NEXT:    s_endpgm
1733;
1734; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64:
1735; GFX10-GISEL:       ; %bb.0: ; %entry
1736; GFX10-GISEL-NEXT:    s_clause 0x1
1737; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1738; GFX10-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1739; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX10-GISEL-NEXT:    s_or_b64 s[6:7], s[0:1], s[4:5]
1741; GFX10-GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
1742; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1743; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1744; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
1745; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
1746; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
1747; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
1748; GFX10-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1749; GFX10-GISEL-NEXT:    s_endpgm
; The block label below is echoed in the "%bb.0" comment that the checks match,
; so it must stay in sync with the expected output.
1750entry:
1751  %and0 = and i64 %x, %z
1752  %or0 = or i64 %x, %z
1753  %and1 = and i64 %y, %or0
1754  %or1 = or i64 %and0, %and1
1755
  ; The value name suggests this add exists to give the result a scalar user;
  ; the checks show it selected as s_add_u32 + s_addc_u32.
1756  %scalar.use = add i64 %or1, 10
  ; The destination pointer is undef; the test only checks the emitted
  ; instruction sequence.
1757  store i64 %scalar.use, ptr addrspace(1) undef
1758  ret void
1759}
1760