; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bfi_int.ll (revision eb88e793ff579071e03766970e46a5d60d77cf7c)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s

; BFI_INT Definition pattern from ISA docs
; (y & x) | (z & ~x)
;
define amdgpu_kernel void @s_bfi_def_i32(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_def_i32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_andn2_b32 s6, s6, s4
; GFX7-NEXT:    s_and_b32 s4, s5, s4
; GFX7-NEXT:    s_or_b32 s4, s6, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_bfi_def_i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_andn2_b32 s4, s4, s2
; GFX8-NEXT:    s_and_b32 s2, s3, s2
; GFX8-NEXT:    s_or_b32 s2, s4, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: s_bfi_def_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_andn2_b32 s4, s4, s2
; GFX10-NEXT:    s_and_b32 s2, s3, s2
; GFX10-NEXT:    s_or_b32 s2, s4, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = xor i32 %x, -1
  %1 = and i32 %z, %0
  %2 = and i32 %y, %x
  %3 = or i32 %1, %2
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

; All-VGPR version of the BFI definition pattern; selects a single v_bfi_b32.
define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_def_i32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_def_i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_def_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  %0 = xor i32 %x, -1
  %1 = and i32 %z, %0
  %2 = and i32 %y, %x
  %3 = or i32 %1, %2
  ret i32 %3
}

; SHA-256 Ch function
; z ^ (x & (y ^ z))
define amdgpu_kernel void @s_bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_xor_b32 s5, s5, s6
; GFX7-NEXT:    s_and_b32 s4, s4, s5
; GFX7-NEXT:    s_xor_b32 s4, s6, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_xor_b32 s3, s3, s4
; GFX8-NEXT:    s_and_b32 s2, s2, s3
; GFX8-NEXT:    s_xor_b32 s2, s4, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_xor_b32 s3, s3, s4
; GFX10-NEXT:    s_and_b32 s2, s2, s3
; GFX10-NEXT:    s_xor_b32 s2, s4, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = xor i32 %y, %z
  %1 = and i32 %x, %0
  %2 = xor i32 %z, %1
  store i32 %2, i32 addrspace(1)* %out
  ret void
}

; All-VGPR Ch function; selects a single v_bfi_b32.
define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  %0 = xor i32 %y, %z
  %1 = and i32 %x, %0
  %2 = xor i32 %z, %1
  ret i32 %2
}

; Ch function with %x in a VGPR and %y, %z passed inreg (SGPRs).
define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s1
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; Ch function with %x, %z inreg (SGPRs) and %y in a VGPR.
define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s1
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; Ch function with %x, %y inreg (SGPRs) and %z in a VGPR.
define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    v_bfi_b32 v0, s0, v1, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_bfi_b32 v0, s0, v1, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_bfi_b32 v0, s0, s1, v0
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; Ch function with only %x inreg (SGPR); %y, %z in VGPRs.
define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_v_v_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v1
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; Ch function with only %y inreg (SGPR); %x, %z in VGPRs.
define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: v_s_v_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v1
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; Ch function with only %z inreg (SGPR); %x, %y in VGPRs.
define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: v_v_s_bfi_sha256_ch:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, s0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_v_s_bfi_sha256_ch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_v_s_bfi_sha256_ch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, s0
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; SHA-256 Ma function
; ((x & z) | (y & (x | z)))
define amdgpu_kernel void @s_bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ma:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s6
; GFX7-NEXT:    s_or_b32 s4, s4, s6
; GFX7-NEXT:    s_and_b32 s4, s5, s4
; GFX7-NEXT:    s_or_b32 s4, s7, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ma:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_and_b32 s5, s2, s4
; GFX8-NEXT:    s_or_b32 s2, s2, s4
; GFX8-NEXT:    s_and_b32 s2, s3, s2
; GFX8-NEXT:    s_or_b32 s2, s5, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ma:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_or_b32 s5, s2, s4
; GFX10-NEXT:    s_and_b32 s2, s2, s4
; GFX10-NEXT:    s_and_b32 s3, s3, s5
; GFX10-NEXT:    s_or_b32 s2, s2, s3
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = and i32 %x, %z
  %1 = or i32 %x, %z
  %2 = and i32 %y, %1
  %3 = or i32 %0, %2
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

374define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
375; GFX7-LABEL: v_bfi_sha256_ma:
376; GFX7:       ; %bb.0: ; %entry
377; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
379; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v1
380; GFX7-NEXT:    s_setpc_b64 s[30:31]
381;
382; GFX8-LABEL: v_bfi_sha256_ma:
383; GFX8:       ; %bb.0: ; %entry
384; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
386; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v1
387; GFX8-NEXT:    s_setpc_b64 s[30:31]
388;
389; GFX10-LABEL: v_bfi_sha256_ma:
390; GFX10:       ; %bb.0: ; %entry
391; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
392; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
393; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
394; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v1
395; GFX10-NEXT:    s_setpc_b64 s[30:31]
396entry:
397  %0 = and i32 %x, %z
398  %1 = or i32 %x, %z
399  %2 = and i32 %y, %1
400  %3 = or i32 %0, %2
401  ret i32 %3
402}
403
; Vector bitselect ((a ^ mask) & b) ^ mask; one v_bfi_b32 per element.
define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
; GFX7-LABEL: v_bitselect_v2i32_pat1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_v2i32_pat1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_v2i32_pat1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %xor.0 = xor <2 x i32> %a, %mask
  %and = and <2 x i32> %xor.0, %b
  %bitselect = xor <2 x i32> %and, %mask
  ret <2 x i32> %bitselect
}

; i64 bitselect (a & b) | (~a & mask); one v_bfi_b32 per 32-bit half.
define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, v5
; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, v5
; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v4
; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  ret i64 %bitselect
}

; pat_0 with %a in VGPRs and %b, %mask inreg (SGPRs).
define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
; GFX10-NEXT:    ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_0 with %a, %mask inreg (SGPRs) and %b in VGPRs.
define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
; GFX10-NEXT:    ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_0 with %a, %b inreg (SGPRs) and %mask in VGPRs.
define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    v_bfi_b32 v1, s1, v2, v1
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_bfi_b32 v0, s0, v2, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_bfi_b32 v1, s1, v2, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_bfi_b32 v0, s0, v2, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, s0, s2, v0
; GFX10-NEXT:    v_bfi_b32 v1, s1, s3, v1
; GFX10-NEXT:    ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_0 with %a, %b in VGPRs and %mask inreg (SGPRs).
define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
; GFX10-NEXT:    ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_0 with %b inreg (SGPRs); %a, %mask in VGPRs.
define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v3
; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v3
; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, v3
; GFX10-NEXT:    ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_0 with %a inreg (SGPRs); %b, %mask in VGPRs.
define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v3
; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v3
; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, v3
; GFX10-NEXT:    ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; i64 bitselect via xor/and/xor: ((a ^ mask) & b) ^ mask.
define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  ret i64 %bitselect
}

; pat_1 with %a in VGPRs and %b, %mask inreg (SGPRs).
define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    v_bfi_b32 v1, s1, v1, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_bfi_b32 v1, s1, v1, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, s0, v0, s2
; GFX10-NEXT:    v_bfi_b32 v1, s1, v1, s3
; GFX10-NEXT:    ; return to shader part epilog
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_1 with %a, %b inreg (SGPRs) and %mask in VGPRs.
define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    v_bfi_b32 v1, s3, v2, v1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    v_bfi_b32 v0, s2, v2, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_bfi_b32 v1, s3, v2, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    v_bfi_b32 v0, s2, v2, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, s2, s0, v0
; GFX10-NEXT:    v_bfi_b32 v1, s3, s1, v1
; GFX10-NEXT:    ; return to shader part epilog
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; pat_1 with %a, %mask inreg (SGPRs) and %b in VGPRs.
define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    v_bfi_b32 v1, v1, s1, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_bfi_b32 v1, v1, s1, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_bfi_b32 v0, v0, s0, s2
; GFX10-NEXT:    v_bfi_b32 v1, v1, s1, s3
; GFX10-NEXT:    ; return to shader part epilog
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

; Same IR as pat_1 (xor/and/xor form); kept as a separate check of the selection.
define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  ret i64 %bitselect
}

; i64 Ma function; per-half xor + v_bfi_b32.
define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX7-LABEL: v_bfi_sha256_ma_i64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
; GFX7-NEXT:    v_bfi_b32 v1, v1, v5, v3
; GFX7-NEXT:    v_bfi_b32 v0, v0, v4, v2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ma_i64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT:    v_bfi_b32 v1, v1, v5, v3
; GFX8-NEXT:    v_bfi_b32 v0, v0, v4, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ma_i64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
; GFX10-NEXT:    v_bfi_b32 v0, v0, v4, v2
; GFX10-NEXT:    v_bfi_b32 v1, v1, v5, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  ret i64 %or1
}

; i64 Ma with %x in VGPRs and %y, %z inreg (SGPRs).
define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT:    v_mov_b32_e32 v2, s1
; GFX7-NEXT:    v_bfi_b32 v1, v1, s3, v2
; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    v_bfi_b32 v0, v0, s2, v2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_bfi_b32 v1, v1, s3, v2
; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    v_bfi_b32 v0, v0, s2, v2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
; GFX10-NEXT:    v_bfi_b32 v0, v0, s2, s0
; GFX10-NEXT:    v_bfi_b32 v1, v1, s3, s1
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

; i64 Ma with %x, %z inreg (SGPRs) and %y in VGPRs.
define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v1
; GFX7-NEXT:    v_bfi_b32 v1, v2, s3, v1
; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v0
; GFX7-NEXT:    v_bfi_b32 v0, v2, s2, v0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v1
; GFX8-NEXT:    v_bfi_b32 v1, v2, s3, v1
; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v0
; GFX8-NEXT:    v_bfi_b32 v0, v2, s2, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v0
; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v1
; GFX10-NEXT:    v_bfi_b32 v0, v2, s2, v0
; GFX10-NEXT:    v_bfi_b32 v1, v3, s3, v1
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

; i64 Ma with %x, %y inreg (SGPRs) and %z in VGPRs.
define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    v_mov_b32_e32 v2, s3
; GFX7-NEXT:    v_xor_b32_e32 v2, s1, v2
; GFX7-NEXT:    v_bfi_b32 v1, v2, v1, s3
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_xor_b32_e32 v2, s0, v2
; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, s2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_xor_b32_e32 v2, s1, v2
; GFX8-NEXT:    v_bfi_b32 v1, v2, v1, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, s2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_xor_b32_e64 v2, s0, s2
; GFX10-NEXT:    v_xor_b32_e64 v3, s1, s3
; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, s2
; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, s3
; GFX10-NEXT:    ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

; SHA-256 Ma pattern, i64, with %x and %z divergent (v0:v1 / v2:v3) and %y
; uniform (inreg -> s0:s1).  Checks expect both halves to fold to
; v_xor + v_bfi with the SGPR %y half used as a direct operand; no v_mov
; copies are needed since the xor already has a VGPR operand.
915define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
916; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
917; GFX7:       ; %bb.0: ; %entry
918; GFX7-NEXT:    v_xor_b32_e32 v1, s1, v1
919; GFX7-NEXT:    v_xor_b32_e32 v0, s0, v0
920; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, s1
921; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, s0
922; GFX7-NEXT:    ; return to shader part epilog
923;
924; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
925; GFX8:       ; %bb.0: ; %entry
926; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
927; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
928; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, s1
929; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, s0
930; GFX8-NEXT:    ; return to shader part epilog
931;
932; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
933; GFX10:       ; %bb.0: ; %entry
934; GFX10-NEXT:    v_xor_b32_e32 v0, s0, v0
935; GFX10-NEXT:    v_xor_b32_e32 v1, s1, v1
936; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, s0
937; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, s1
938; GFX10-NEXT:    ; return to shader part epilog
939entry:
940  %and0 = and i64 %x, %z
941  %or0 = or i64 %x, %z
942  %and1 = and i64 %y, %or0
943  %or1 = or i64 %and0, %and1
944  %cast = bitcast i64 %or1 to <2 x float>
945  ret <2 x float> %cast
946}
947
; i64 bitselect pattern 0: (a & b) | (~a & mask), selecting bits of %b where
; %a is set and bits of %mask elsewhere.  All operands are uniform kernel
; arguments, so checks expect the scalar s_and/s_andn2/s_or expansion, not
; v_bfi (BFI is a VALU-only instruction).
948define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
949; GFX7-LABEL: s_bitselect_i64_pat_0:
950; GFX7:       ; %bb.0:
951; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
952; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
953; GFX7-NEXT:    s_mov_b32 s3, 0xf000
954; GFX7-NEXT:    s_mov_b32 s2, -1
955; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
956; GFX7-NEXT:    s_and_b64 s[6:7], s[4:5], s[6:7]
957; GFX7-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
958; GFX7-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
959; GFX7-NEXT:    s_add_u32 s0, s0, 10
960; GFX7-NEXT:    s_addc_u32 s1, s1, 0
961; GFX7-NEXT:    v_mov_b32_e32 v0, s0
962; GFX7-NEXT:    v_mov_b32_e32 v1, s1
963; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
964; GFX7-NEXT:    s_endpgm
965;
966; GFX8-LABEL: s_bitselect_i64_pat_0:
967; GFX8:       ; %bb.0:
968; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
969; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
970; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
972; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
973; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
974; GFX8-NEXT:    s_add_u32 s0, s0, 10
975; GFX8-NEXT:    s_addc_u32 s1, s1, 0
976; GFX8-NEXT:    v_mov_b32_e32 v0, s0
977; GFX8-NEXT:    v_mov_b32_e32 v1, s1
978; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
979; GFX8-NEXT:    s_endpgm
980;
981; GFX10-LABEL: s_bitselect_i64_pat_0:
982; GFX10:       ; %bb.0:
983; GFX10-NEXT:    s_clause 0x1
984; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
985; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
986; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX10-NEXT:    s_and_b64 s[2:3], s[4:5], s[6:7]
988; GFX10-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
989; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
990; GFX10-NEXT:    s_add_u32 s0, s0, 10
991; GFX10-NEXT:    s_addc_u32 s1, s1, 0
992; GFX10-NEXT:    v_mov_b32_e32 v0, s0
993; GFX10-NEXT:    v_mov_b32_e32 v1, s1
994; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
995; GFX10-NEXT:    s_endpgm
996  %and0 = and i64 %a, %b
997  %not.a = xor i64 %a, -1
998  %and1 = and i64 %not.a, %mask
999  %bitselect = or i64 %and0, %and1
  ; The add forces a scalar use of the select result; the store (to an
  ; undef address) just keeps that value live past optimization.
1000  %scalar.use = add i64 %bitselect, 10
1001  store i64 %scalar.use, i64 addrspace(1)* undef
1002  ret void
1003}
1004
; i64 bitselect pattern 1: ((a ^ mask) & b) ^ mask — the xor-form of
; bitselect (yields a where the corresponding bit of b is set, mask
; elsewhere).  All operands are uniform, so checks expect the scalar
; s_xor/s_and/s_xor sequence rather than v_bfi.
1005define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1006; GFX7-LABEL: s_bitselect_i64_pat_1:
1007; GFX7:       ; %bb.0:
1008; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1009; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1010; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1011; GFX7-NEXT:    s_mov_b32 s2, -1
1012; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1013; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1014; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1015; GFX7-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
1016; GFX7-NEXT:    s_add_u32 s0, s0, 10
1017; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1018; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1019; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1020; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1021; GFX7-NEXT:    s_endpgm
1022;
1023; GFX8-LABEL: s_bitselect_i64_pat_1:
1024; GFX8:       ; %bb.0:
1025; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1026; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1027; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX8-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1029; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1030; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1031; GFX8-NEXT:    s_add_u32 s0, s0, 10
1032; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1033; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1034; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1035; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1036; GFX8-NEXT:    s_endpgm
1037;
1038; GFX10-LABEL: s_bitselect_i64_pat_1:
1039; GFX10:       ; %bb.0:
1040; GFX10-NEXT:    s_clause 0x1
1041; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1042; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1043; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1044; GFX10-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1045; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1046; GFX10-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1047; GFX10-NEXT:    s_add_u32 s0, s0, 10
1048; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1049; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1050; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1051; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1052; GFX10-NEXT:    s_endpgm
1053  %xor.0 = xor i64 %a, %mask
1054  %and = and i64 %xor.0, %b
1055  %bitselect = xor i64 %and, %mask
1056
  ; Scalar use + store to an undef address keep the result live.
1057  %scalar.use = add i64 %bitselect, 10
1058  store i64 %scalar.use, i64 addrspace(1)* undef
1059  ret void
1060}
1061
; i64 bitselect pattern 2, all-uniform operands; checks expect the same
; scalar s_xor/s_and/s_xor sequence as pat_1.
; NOTE(review): the IR body here is byte-identical to s_bitselect_i64_pat_1
; (same ((a ^ mask) & b) ^ mask, same operand order).  Presumably this was
; meant to be a commuted variant of the pattern — confirm against the i32
; pat_2 test earlier in this file before relying on it for coverage.
1062define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1063; GFX7-LABEL: s_bitselect_i64_pat_2:
1064; GFX7:       ; %bb.0:
1065; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1066; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1067; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1068; GFX7-NEXT:    s_mov_b32 s2, -1
1069; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
1071; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
1072; GFX7-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
1073; GFX7-NEXT:    s_add_u32 s0, s0, 10
1074; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1075; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1076; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1077; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1078; GFX7-NEXT:    s_endpgm
1079;
1080; GFX8-LABEL: s_bitselect_i64_pat_2:
1081; GFX8:       ; %bb.0:
1082; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1083; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1084; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1085; GFX8-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1086; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1087; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1088; GFX8-NEXT:    s_add_u32 s0, s0, 10
1089; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1090; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1091; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1092; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1093; GFX8-NEXT:    s_endpgm
1094;
1095; GFX10-LABEL: s_bitselect_i64_pat_2:
1096; GFX10:       ; %bb.0:
1097; GFX10-NEXT:    s_clause 0x1
1098; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1099; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1100; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX10-NEXT:    s_xor_b64 s[2:3], s[4:5], s[0:1]
1102; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
1103; GFX10-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
1104; GFX10-NEXT:    s_add_u32 s0, s0, 10
1105; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1106; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1107; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1108; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1109; GFX10-NEXT:    s_endpgm
1110  %xor.0 = xor i64 %a, %mask
1111  %and = and i64 %xor.0, %b
1112  %bitselect = xor i64 %and, %mask
1113
  ; Scalar use + store to an undef address keep the result live.
1114  %scalar.use = add i64 %bitselect, 10
1115  store i64 %scalar.use, i64 addrspace(1)* undef
1116  ret void
1117}
1118
; SHA-256 Ma pattern, i64, with every operand a uniform kernel argument and
; a scalar (add) use of the result.  Checks expect the whole expression to
; stay on the SALU as s_and/s_or pairs — no v_bfi — unlike the amdgpu_ps
; variants above where a divergent operand pulls the pattern onto the VALU.
1119define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1120; GFX7-LABEL: s_bfi_sha256_ma_i64:
1121; GFX7:       ; %bb.0: ; %entry
1122; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1123; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1124; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1125; GFX7-NEXT:    s_mov_b32 s2, -1
1126; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1127; GFX7-NEXT:    s_and_b64 s[8:9], s[4:5], s[0:1]
1128; GFX7-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1129; GFX7-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1130; GFX7-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
1131; GFX7-NEXT:    s_add_u32 s0, s0, 10
1132; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1133; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1134; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1135; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1136; GFX7-NEXT:    s_endpgm
1137;
1138; GFX8-LABEL: s_bfi_sha256_ma_i64:
1139; GFX8:       ; %bb.0: ; %entry
1140; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1141; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1142; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1143; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[0:1]
1144; GFX8-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
1145; GFX8-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1146; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1147; GFX8-NEXT:    s_add_u32 s0, s0, 10
1148; GFX8-NEXT:    s_addc_u32 s1, s1, 0
1149; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1150; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1151; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
1152; GFX8-NEXT:    s_endpgm
1153;
1154; GFX10-LABEL: s_bfi_sha256_ma_i64:
1155; GFX10:       ; %bb.0: ; %entry
1156; GFX10-NEXT:    s_clause 0x1
1157; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1158; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1159; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[0:1]
1161; GFX10-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
1162; GFX10-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
1163; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1164; GFX10-NEXT:    s_add_u32 s0, s0, 10
1165; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1166; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1167; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1168; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
1169; GFX10-NEXT:    s_endpgm
1170entry:
1171  %and0 = and i64 %x, %z
1172  %or0 = or i64 %x, %z
1173  %and1 = and i64 %y, %or0
1174  %or1 = or i64 %and0, %and1
1175
  ; Scalar use + store to an undef address keep the result live.
1176  %scalar.use = add i64 %or1, 10
1177  store i64 %scalar.use, i64 addrspace(1)* undef
1178  ret void
1179}
1180