; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11

declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

; Scalar funnel-shift-left with a variable shift amount.
define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshl_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_lshr_b32 s1, s0, 1
; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
; SI-NEXT: s_not_b32 s0, s2
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: s_not_b32 s2, s2
; VI-NEXT: s_lshr_b32 s1, s0, 1
; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_not_b32 s2, s2
; GFX9-NEXT: s_lshr_b32 s1, s0, 1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.Z, KC0[2].Z, 1,
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[3].X,
; R600-NEXT: BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1
; GFX10-NEXT: s_lshr_b32 s0, s0, 1
; GFX10-NEXT: s_not_b32 s1, s2
; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
; GFX11-NEXT: s_lshr_b32 s0, s0, 1
; GFX11-NEXT: s_not_b32 s1, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, ptr addrspace(1) %in
  ret void
}

; Scalar funnel-shift-left with a constant shift amount (7).
define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-LABEL: fshl_i32_imm:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, ptr addrspace(1) %in
  ret void
}

; 2-wide vector funnel-shift-left with variable shift amounts.
define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1
; SI-NEXT: s_not_b32 s3, s5
; SI-NEXT: s_lshr_b32 s1, s1, 1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_not_b32 s1, s4
; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
; SI-NEXT: s_lshr_b32 s0, s0, 1
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_not_b32 s7, s7
; VI-NEXT: s_lshr_b32 s3, s1, 1
; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_not_b32 s1, s6
; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
; VI-NEXT: s_lshr_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_lshr_b32 s3, s1, 1
; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1
; GFX9-NEXT: s_not_b32 s1, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_not_b32 s1, s8
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.Z, KC0[3].X, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[4].X,
; R600-NEXT: BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
; R600-NEXT: LSHR T0.Z, KC0[2].W, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[3].W,
; R600-NEXT: BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1
; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
; GFX10-NEXT: s_not_b32 s2, s7
; GFX10-NEXT: s_lshr_b32 s0, s0, 1
; GFX10-NEXT: s_not_b32 s3, s6
; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2
; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v2i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1
; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: s_not_b32 s2, s7
; GFX11-NEXT: s_lshr_b32 s0, s0, 1
; GFX11-NEXT: s_not_b32 s3, s6
; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2
; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, ptr addrspace(1) %in
  ret void
}

; 2-wide vector funnel-shift-left with constant shift amounts <7, 9>.
define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23
; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23
; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23
; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23
; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v2i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23
; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, ptr addrspace(1) %in
  ret void
}

; 4-wide vector funnel-shift-left with variable shift amounts.
define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshl_v4i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_not_b32 s5, s19
; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1
; SI-NEXT: s_lshr_b32 s4, s11, 1
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: s_not_b32 s5, s18
; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1
; SI-NEXT: s_lshr_b32 s4, s10, 1
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: s_not_b32 s5, s17
; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1
; SI-NEXT: s_lshr_b32 s4, s9, 1
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_not_b32 s5, s16
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
; SI-NEXT: s_lshr_b32 s4, s8, 1
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v4i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s15
; VI-NEXT: s_not_b32 s3, s3
; VI-NEXT: s_lshr_b32 s6, s11, 1
; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s14
; VI-NEXT: s_not_b32 s2, s2
; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1
; VI-NEXT: s_lshr_b32 s3, s10, 1
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: s_not_b32 s1, s1
; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1
; VI-NEXT: s_lshr_b32 s2, s9, 1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_not_b32 s0, s0
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
; VI-NEXT: s_lshr_b32 s1, s8, 1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_not_b32 s3, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s15
; GFX9-NEXT: s_lshr_b32 s4, s11, 1
; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s14
; GFX9-NEXT: s_not_b32 s2, s2
; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1
; GFX9-NEXT: s_lshr_b32 s3, s10, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s13
; GFX9-NEXT: s_not_b32 s1, s1
; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1
; GFX9-NEXT: s_lshr_b32 s2, s9, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
; GFX9-NEXT: s_lshr_b32 s1, s8, 1
; GFX9-NEXT: v_mov_b32_e32 v5, s0
; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.Z, KC0[4].X, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[6].X,
; R600-NEXT: LSHR T0.Y, KC0[3].W, 1,
; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
; R600-NEXT: NOT_INT * T1.W, KC0[5].W,
; R600-NEXT: LSHR T1.Y, KC0[3].Z, 1,
; R600-NEXT: BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
; R600-NEXT: NOT_INT * T2.W, KC0[5].Z,
; R600-NEXT: BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
; R600-NEXT: LSHR T1.Z, KC0[3].Y, 1,
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT: NOT_INT * T2.W, KC0[5].Y,
; R600-NEXT: BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1
; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1
; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1
; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1
; GFX10-NEXT: s_lshr_b32 s4, s11, 1
; GFX10-NEXT: s_not_b32 s3, s3
; GFX10-NEXT: s_lshr_b32 s5, s10, 1
; GFX10-NEXT: s_not_b32 s2, s2
; GFX10-NEXT: s_lshr_b32 s9, s9, 1
; GFX10-NEXT: s_not_b32 s1, s1
; GFX10-NEXT: s_lshr_b32 s8, s8, 1
; GFX10-NEXT: s_not_b32 s0, s0
; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3
; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2
; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1
; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v4i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1
; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1
; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1
; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1
; GFX11-NEXT: s_lshr_b32 s6, s11, 1
; GFX11-NEXT: s_not_b32 s3, s3
; GFX11-NEXT: s_lshr_b32 s7, s10, 1
; GFX11-NEXT: s_not_b32 s2, s2
; GFX11-NEXT: s_lshr_b32 s9, s9, 1
; GFX11-NEXT: s_not_b32 s1, s1
; GFX11-NEXT: s_lshr_b32 s8, s8, 1
; GFX11-NEXT: s_not_b32 s0, s0
; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3
; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2
; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1
; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, ptr addrspace(1) %in
  ret void
}

; 4-wide vector funnel-shift-left with constant shift amounts <1, 7, 9, 33>.
define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshl_v4i32_imm:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: v_mov_b32_e32 v1, s14
; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23
; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s15
; VI-NEXT: v_mov_b32_e32 v1, s14
; VI-NEXT: v_mov_b32_e32 v4, s13
; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31
; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s15
; GFX9-NEXT: v_mov_b32_e32 v1, s14
; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31
; GFX9-NEXT: v_mov_b32_e32 v0, s13
; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23
; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31
; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23
; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25
; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v4i32_imm:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31
; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23
; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25
; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, ptr addrspace(1) %in
  ret void
}

; (a ^ b) | a --> a | b
define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; SI-LABEL: orxor2or1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_lshl_b32 s0, s2, 7
; SI-NEXT: s_or_b32 s0, s3, s0
; SI-NEXT: s_cmp_eq_u32 s0, 0
; SI-NEXT: s_cselect_b32 s0, s2, s3
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: orxor2or1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s2, 7
; VI-NEXT: s_or_b32 s4, s3, s4
; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: orxor2or1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s4, s2, 7
; GFX9-NEXT: s_or_b32 s4, s3, s4
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: orxor2or1:
; R600: ; %bb.0:
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT: OR_INT * T0.W, KC0[2].W, PV.W,
; R600-NEXT: CNDE_INT T0.X, PV.W, KC0[2].Z, KC0[2].W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: orxor2or1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl_b32 s4, s2, 7
; GFX10-NEXT: s_or_b32 s4, s3, s4
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: orxor2or1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s2, 7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s4, s3, s4
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %shl = shl i32 %a, 7
  %xor = xor i32 %shl, %b
  %or = or i32 %a, %xor
  %fshl = call i32 @llvm.fshl.i32(i32 %or, i32 %xor, i32 7)
  %cond = icmp eq i32 %fshl, 0
  %r = select i1 %cond, i32 %a, i32 %b
  store i32 %r, ptr addrspace(1) %in
  ret void
}