; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s

; BFI_INT Definition pattern from ISA docs
; (y & x) | (z & ~x)

define amdgpu_kernel void @s_bfi_def_i32(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_def_i32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_andn2_b32 s6, s6, s4
; GFX7-NEXT: s_and_b32 s4, s5, s4
; GFX7-NEXT: s_or_b32 s4, s6, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_def_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_andn2_b32 s4, s4, s2
; GFX8-NEXT: s_and_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_def_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_andn2_b32 s4, s4, s2
; GFX10-NEXT: s_and_b32 s2, s3, s2
; GFX10-NEXT: s_or_b32 s2, s4, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
entry:
  %0 = xor i32 %x, -1
  %1 = and i32 %z, %0
  %2 = and i32 %y, %x
  %3 = or i32 %1, %2
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_def_i32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_def_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_def_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
  %0 = xor i32 %x, -1
  %1 = and i32 %z, %0
  %2 = and i32 %y, %x
  %3 = or i32 %1, %2
  ret i32 %3
}

; SHA-256 Ch function
; z ^ (x & (y ^ z))
define amdgpu_kernel void @s_bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_xor_b32 s5, s5, s6
; GFX7-NEXT: s_and_b32 s4, s4, s5
; GFX7-NEXT: s_xor_b32 s4, s6, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s3, s3, s4
; GFX8-NEXT: s_and_b32 s2, s2, s3
; GFX8-NEXT: s_xor_b32 s2, s4, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_xor_b32 s3, s3, s4
; GFX10-NEXT: s_and_b32 s2, s2, s3
; GFX10-NEXT: s_xor_b32 s2, s4, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
entry:
  %0 = xor i32 %y, %z
  %1 = and i32 %x, %0
  %2 = xor i32 %z, %1
  store i32 %2, i32 addrspace(1)* %out
  ret void
}

define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
  %0 = xor i32 %y, %z
  %1 = and i32 %x, %0
  %2 = xor i32 %z, %1
  ret i32 %2
}

define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_bfi_b32 v0, s0, v1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
; GFX10-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_v_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: v_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: v_v_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_v_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_v_s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX10-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}

; SHA-256 Ma function
; ((x & z) | (y & (x | z)))
define amdgpu_kernel void @s_bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ma:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s4, s6
; GFX7-NEXT: s_or_b32 s4, s4, s6
; GFX7-NEXT: s_and_b32 s4, s5, s4
; GFX7-NEXT: s_or_b32 s4, s7, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ma:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s5, s2, s4
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s2, s5, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ma:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_or_b32 s5, s2, s4
; GFX10-NEXT: s_and_b32 s2, s2, s4
; GFX10-NEXT: s_and_b32 s3, s3, s5
; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
entry:
  %0 = and i32 %x, %z
  %1 = or i32 %x, %z
  %2 = and i32 %y, %1
  %3 = or i32 %0, %2
  store i32 %3, i32 addrspace(1)* %out
  ret void
}

define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_sha256_ma:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ma:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ma:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
  %0 = and i32 %x, %z
  %1 = or i32 %x, %z
  %2 = and i32 %y, %1
  %3 = or i32 %0, %2
  ret i32 %3
}

define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
; GFX7-LABEL: v_bitselect_v2i32_pat1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_v2i32_pat1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_v2i32_pat1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %xor.0 = xor <2 x i32> %a, %mask
  %and = and <2 x i32> %xor.0, %b
  %bitselect = xor <2 x i32> %and, %mask
  ret <2 x i32> %bitselect
}

define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX10-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  ret i64 %bitselect
}

define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s2
; GFX10-NEXT: v_bfi_b32 v1, v1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s2
; GFX10-NEXT: v_bfi_b32 v1, s1, v1, s3
; GFX10-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, s0, v2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s1, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, s0, v2, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, s2, v0
; GFX10-NEXT: v_bfi_b32 v1, s1, s3, v1
; GFX10-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX10-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX10-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX10-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v3
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v3
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX10-NEXT: v_bfi_b32 v1, s1, v1, v3
; GFX10-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  ret i64 %bitselect
}

define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s2
; GFX10-NEXT: v_bfi_b32 v1, s1, v1, s3
; GFX10-NEXT: ; return to shader part epilog
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
; GFX10-NEXT: ; return to shader part epilog
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s2
; GFX10-NEXT: v_bfi_b32 v1, v1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}

define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  ret i64 %bitselect
}

define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX7-LABEL: v_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX7-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX7-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX8-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX10-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX10-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  ret i64 %or1
}

define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_bfi_b32 v1, v1, s3, v2
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_bfi_b32 v0, v0, s2, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_bfi_b32 v1, v1, s3, v2
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_bfi_b32 v0, v0, s2, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX10-NEXT: v_bfi_b32 v0, v0, s2, s0
; GFX10-NEXT: v_bfi_b32 v1, v1, s3, s1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_xor_b32_e32 v2, s1, v1
; GFX7-NEXT: v_bfi_b32 v1, v2, s3, v1
; GFX7-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX7-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_xor_b32_e32 v2, s1, v1
; GFX8-NEXT: v_bfi_b32 v1, v2, s3, v1
; GFX8-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX8-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX10-NEXT: v_xor_b32_e32 v3, s1, v1
; GFX10-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX10-NEXT: v_bfi_b32 v1, v3, s3, v1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_xor_b32_e32 v2, s1, v2
; GFX7-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_xor_b32_e32 v2, s1, v2
; GFX8-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e64 v2, s0, s2
; GFX10-NEXT: v_xor_b32_e64 v3, s1, s3
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, s3
; GFX10-NEXT: ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX10-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-NEXT: ; return to shader part epilog
entry:
  %and0 = and i64 %x, %z
  %or0 = or i64 %x, %z
  %and1 = and i64 %y, %or0
  %or1 = or i64 %and0, %and1
  %cast = bitcast i64 %or1 to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_endpgm
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %scalar.use = add i64 %bitselect, 10
  store i64 %scalar.use, i64 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT:
v_mov_b32_e32 v1, s1 1035; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1] 1036; GFX8-NEXT: s_endpgm 1037; 1038; GFX10-LABEL: s_bitselect_i64_pat_1: 1039; GFX10: ; %bb.0: 1040; GFX10-NEXT: s_clause 0x1 1041; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1042; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1043; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1044; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] 1045; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] 1046; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] 1047; GFX10-NEXT: s_add_u32 s0, s0, 10 1048; GFX10-NEXT: s_addc_u32 s1, s1, 0 1049; GFX10-NEXT: v_mov_b32_e32 v0, s0 1050; GFX10-NEXT: v_mov_b32_e32 v1, s1 1051; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off 1052; GFX10-NEXT: s_endpgm 1053 %xor.0 = xor i64 %a, %mask 1054 %and = and i64 %xor.0, %b 1055 %bitselect = xor i64 %and, %mask 1056 1057 %scalar.use = add i64 %bitselect, 10 1058 store i64 %scalar.use, i64 addrspace(1)* undef 1059 ret void 1060} 1061 1062define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { 1063; GFX7-LABEL: s_bitselect_i64_pat_2: 1064; GFX7: ; %bb.0: 1065; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1066; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1067; GFX7-NEXT: s_mov_b32 s3, 0xf000 1068; GFX7-NEXT: s_mov_b32 s2, -1 1069; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] 1071; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] 1072; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] 1073; GFX7-NEXT: s_add_u32 s0, s0, 10 1074; GFX7-NEXT: s_addc_u32 s1, s1, 0 1075; GFX7-NEXT: v_mov_b32_e32 v0, s0 1076; GFX7-NEXT: v_mov_b32_e32 v1, s1 1077; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1078; GFX7-NEXT: s_endpgm 1079; 1080; GFX8-LABEL: s_bitselect_i64_pat_2: 1081; GFX8: ; %bb.0: 1082; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1083; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1084; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1085; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] 1086; GFX8-NEXT: 
s_and_b64 s[2:3], s[2:3], s[6:7] 1087; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] 1088; GFX8-NEXT: s_add_u32 s0, s0, 10 1089; GFX8-NEXT: s_addc_u32 s1, s1, 0 1090; GFX8-NEXT: v_mov_b32_e32 v0, s0 1091; GFX8-NEXT: v_mov_b32_e32 v1, s1 1092; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1] 1093; GFX8-NEXT: s_endpgm 1094; 1095; GFX10-LABEL: s_bitselect_i64_pat_2: 1096; GFX10: ; %bb.0: 1097; GFX10-NEXT: s_clause 0x1 1098; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1099; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1100; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1] 1102; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] 1103; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] 1104; GFX10-NEXT: s_add_u32 s0, s0, 10 1105; GFX10-NEXT: s_addc_u32 s1, s1, 0 1106; GFX10-NEXT: v_mov_b32_e32 v0, s0 1107; GFX10-NEXT: v_mov_b32_e32 v1, s1 1108; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off 1109; GFX10-NEXT: s_endpgm 1110 %xor.0 = xor i64 %a, %mask 1111 %and = and i64 %xor.0, %b 1112 %bitselect = xor i64 %and, %mask 1113 1114 %scalar.use = add i64 %bitselect, 10 1115 store i64 %scalar.use, i64 addrspace(1)* undef 1116 ret void 1117} 1118 1119define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) { 1120; GFX7-LABEL: s_bfi_sha256_ma_i64: 1121; GFX7: ; %bb.0: ; %entry 1122; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1123; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1124; GFX7-NEXT: s_mov_b32 s3, 0xf000 1125; GFX7-NEXT: s_mov_b32 s2, -1 1126; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1127; GFX7-NEXT: s_and_b64 s[8:9], s[4:5], s[0:1] 1128; GFX7-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] 1129; GFX7-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 1130; GFX7-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] 1131; GFX7-NEXT: s_add_u32 s0, s0, 10 1132; GFX7-NEXT: s_addc_u32 s1, s1, 0 1133; GFX7-NEXT: v_mov_b32_e32 v0, s0 1134; GFX7-NEXT: v_mov_b32_e32 v1, s1 1135; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1136; GFX7-NEXT: s_endpgm 1137; 1138; 
GFX8-LABEL: s_bfi_sha256_ma_i64: 1139; GFX8: ; %bb.0: ; %entry 1140; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1141; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1142; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1] 1144; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] 1145; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 1146; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] 1147; GFX8-NEXT: s_add_u32 s0, s0, 10 1148; GFX8-NEXT: s_addc_u32 s1, s1, 0 1149; GFX8-NEXT: v_mov_b32_e32 v0, s0 1150; GFX8-NEXT: v_mov_b32_e32 v1, s1 1151; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1] 1152; GFX8-NEXT: s_endpgm 1153; 1154; GFX10-LABEL: s_bfi_sha256_ma_i64: 1155; GFX10: ; %bb.0: ; %entry 1156; GFX10-NEXT: s_clause 0x1 1157; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1158; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1159; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1] 1161; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 1162; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3] 1163; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1164; GFX10-NEXT: s_add_u32 s0, s0, 10 1165; GFX10-NEXT: s_addc_u32 s1, s1, 0 1166; GFX10-NEXT: v_mov_b32_e32 v0, s0 1167; GFX10-NEXT: v_mov_b32_e32 v1, s1 1168; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off 1169; GFX10-NEXT: s_endpgm 1170entry: 1171 %and0 = and i64 %x, %z 1172 %or0 = or i64 %x, %z 1173 %and1 = and i64 %y, %or0 1174 %or1 = or i64 %and0, %and1 1175 1176 %scalar.use = add i64 %or1, 10 1177 store i64 %scalar.use, i64 addrspace(1)* undef 1178 ret void 1179} 1180