1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI 3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI 4; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9 5; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 6; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11 8 9declare i32 @llvm.fshr.i32(i32, i32, i32) 10declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) 11declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) 12declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 13declare i16 @llvm.fshr.i16(i16, i16, i16) 14declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) 15declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) 16declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) 17declare i64 @llvm.fshr.i64(i64, i64, i64) 18declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 19declare i24 @llvm.fshr.i24(i24, i24, i24) 20declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) 21 22define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) { 23; SI-LABEL: fshr_i32: 24; SI: ; %bb.0: ; %entry 25; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 26; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 27; SI-NEXT: s_mov_b32 s7, 0xf000 28; SI-NEXT: s_mov_b32 s6, -1 29; SI-NEXT: s_waitcnt lgkmcnt(0) 30; SI-NEXT: v_mov_b32_e32 v0, s1 31; SI-NEXT: v_mov_b32_e32 v1, s2 32; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1 33; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 34; SI-NEXT: s_endpgm 35; 36; VI-LABEL: fshr_i32: 37; VI: ; %bb.0: ; %entry 38; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 39; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 40; VI-NEXT: s_waitcnt lgkmcnt(0) 41; VI-NEXT: v_mov_b32_e32 v0, s1 42; VI-NEXT: v_mov_b32_e32 v1, s2 43; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1 44; VI-NEXT: v_mov_b32_e32 v0, s4 45; VI-NEXT: v_mov_b32_e32 v1, s5 46; VI-NEXT: flat_store_dword v[0:1], v2 47; VI-NEXT: s_endpgm 48; 49; GFX9-LABEL: fshr_i32: 50; GFX9: ; %bb.0: ; %entry 51; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 52; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 53; GFX9-NEXT: v_mov_b32_e32 v0, 0 54; GFX9-NEXT: s_waitcnt lgkmcnt(0) 55; GFX9-NEXT: v_mov_b32_e32 v1, s1 56; GFX9-NEXT: v_mov_b32_e32 v2, s2 57; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 58; GFX9-NEXT: global_store_dword v0, v1, s[6:7] 59; GFX9-NEXT: s_endpgm 60; 61; R600-LABEL: fshr_i32: 62; R600: ; %bb.0: ; %entry 63; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 64; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 65; R600-NEXT: CF_END 66; R600-NEXT: PAD 67; R600-NEXT: ALU clause starting at 4: 68; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 69; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 70; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X, 71; 72; GFX10-LABEL: fshr_i32: 73; GFX10: ; %bb.0: ; %entry 74; GFX10-NEXT: s_clause 0x1 75; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 76; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 77; GFX10-NEXT: v_mov_b32_e32 v1, 0 78; GFX10-NEXT: s_waitcnt lgkmcnt(0) 79; GFX10-NEXT: v_mov_b32_e32 v0, s2 80; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 81; GFX10-NEXT: global_store_dword v1, v0, s[6:7] 82; GFX10-NEXT: s_endpgm 83; 84; GFX11-LABEL: fshr_i32: 85; GFX11: ; %bb.0: ; %entry 86; GFX11-NEXT: s_clause 0x1 87; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c 88; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 89; GFX11-NEXT: s_waitcnt lgkmcnt(0) 90; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 91; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 92; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 93; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] 94; GFX11-NEXT: s_endpgm 95entry: 96 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) 97 store i32 %0, ptr addrspace(1) %in 98 ret void 99} 100 101define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { 102; SI-LABEL: fshr_i32_imm: 103; SI: ; %bb.0: ; %entry 104; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 105; SI-NEXT: s_mov_b32 s7, 0xf000 106; SI-NEXT: s_mov_b32 s6, -1 107; SI-NEXT: s_waitcnt lgkmcnt(0) 108; SI-NEXT: v_mov_b32_e32 v0, s3 109; SI-NEXT: s_mov_b32 s4, s0 110; SI-NEXT: s_mov_b32 s5, s1 111; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 112; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 113; SI-NEXT: s_endpgm 114; 115; VI-LABEL: fshr_i32_imm: 116; VI: ; %bb.0: ; %entry 117; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 118; VI-NEXT: s_waitcnt lgkmcnt(0) 119; VI-NEXT: v_mov_b32_e32 v0, s3 120; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 121; VI-NEXT: v_mov_b32_e32 v0, s0 122; VI-NEXT: v_mov_b32_e32 v1, s1 123; VI-NEXT: flat_store_dword v[0:1], v2 124; VI-NEXT: s_endpgm 125; 126; GFX9-LABEL: fshr_i32_imm: 127; GFX9: ; %bb.0: ; %entry 128; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 129; GFX9-NEXT: v_mov_b32_e32 v0, 0 130; GFX9-NEXT: s_waitcnt lgkmcnt(0) 131; GFX9-NEXT: v_mov_b32_e32 v1, s3 132; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 133; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 134; GFX9-NEXT: s_endpgm 135; 136; R600-LABEL: fshr_i32_imm: 137; R600: ; %bb.0: ; %entry 138; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 139; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 140; R600-NEXT: CF_END 141; R600-NEXT: PAD 142; R600-NEXT: ALU clause starting at 4: 143; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 144; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 145; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, 146; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 147; 148; GFX10-LABEL: fshr_i32_imm: 149; GFX10: ; %bb.0: ; %entry 150; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 151; GFX10-NEXT: v_mov_b32_e32 v0, 0 152; GFX10-NEXT: s_waitcnt lgkmcnt(0) 153; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 154; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 155; GFX10-NEXT: s_endpgm 156; 157; GFX11-LABEL: fshr_i32_imm: 158; GFX11: ; %bb.0: ; %entry 159; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 160; GFX11-NEXT: v_mov_b32_e32 v0, 0 161; GFX11-NEXT: s_waitcnt lgkmcnt(0) 162; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 163; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 164; GFX11-NEXT: s_endpgm 165entry: 166 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7) 167 store i32 %0, ptr addrspace(1) %in 168 ret void 169} 170 171define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { 172; SI-LABEL: fshr_v2i32: 173; SI: ; %bb.0: ; %entry 174; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 175; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf 176; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 177; SI-NEXT: s_mov_b32 s7, 0xf000 178; SI-NEXT: s_mov_b32 s6, -1 179; SI-NEXT: s_waitcnt lgkmcnt(0) 180; SI-NEXT: v_mov_b32_e32 v0, s3 181; SI-NEXT: v_mov_b32_e32 v1, s9 182; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1 183; SI-NEXT: v_mov_b32_e32 v0, s2 184; SI-NEXT: v_mov_b32_e32 v2, s8 185; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2 186; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 187; SI-NEXT: s_endpgm 188; 189; VI-LABEL: fshr_v2i32: 190; VI: ; %bb.0: ; %entry 191; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 192; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 193; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 194; VI-NEXT: s_waitcnt lgkmcnt(0) 195; VI-NEXT: v_mov_b32_e32 v0, s3 196; VI-NEXT: v_mov_b32_e32 v1, s7 197; VI-NEXT: v_mov_b32_e32 v2, s2 198; VI-NEXT: v_alignbit_b32 v1, s1, v0, v1 199; VI-NEXT: v_mov_b32_e32 v0, s6 200; VI-NEXT: v_alignbit_b32 v0, s0, v2, v0 201; VI-NEXT: v_mov_b32_e32 v2, s4 202; VI-NEXT: v_mov_b32_e32 v3, s5 203; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 204; VI-NEXT: s_endpgm 205; 206; GFX9-LABEL: fshr_v2i32: 207; GFX9: ; %bb.0: ; %entry 208; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 209; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 210; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 211; GFX9-NEXT: v_mov_b32_e32 v2, 0 212; GFX9-NEXT: s_waitcnt lgkmcnt(0) 213; GFX9-NEXT: v_mov_b32_e32 v0, s3 214; GFX9-NEXT: v_mov_b32_e32 v1, s7 215; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, v1 216; GFX9-NEXT: v_mov_b32_e32 v0, s2 217; GFX9-NEXT: v_mov_b32_e32 v3, s6 218; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3 219; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 220; GFX9-NEXT: s_endpgm 221; 222; R600-LABEL: fshr_v2i32: 223; R600: ; %bb.0: ; %entry 224; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 225; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 226; R600-NEXT: CF_END 227; R600-NEXT: PAD 228; R600-NEXT: ALU clause starting at 4: 229; R600-NEXT: MOV * T0.W, KC0[4].X, 230; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W, 231; R600-NEXT: MOV * T0.W, KC0[3].W, 232; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W, 233; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 234; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 235; 236; GFX10-LABEL: fshr_v2i32: 237; GFX10: ; %bb.0: ; %entry 238; GFX10-NEXT: s_clause 0x2 239; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 240; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 241; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 242; GFX10-NEXT: v_mov_b32_e32 v3, 0 243; GFX10-NEXT: s_waitcnt lgkmcnt(0) 244; GFX10-NEXT: v_mov_b32_e32 v0, s7 245; GFX10-NEXT: v_mov_b32_e32 v2, s6 246; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, v0 247; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, v2 248; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] 249; GFX10-NEXT: s_endpgm 250; 251; GFX11-LABEL: fshr_v2i32: 252; GFX11: ; %bb.0: ; %entry 253; GFX11-NEXT: s_clause 0x2 254; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c 255; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c 256; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 257; GFX11-NEXT: s_waitcnt lgkmcnt(0) 258; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7 259; GFX11-NEXT: v_mov_b32_e32 v2, s6 260; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 261; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, v0 262; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, v2 263; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5] 264; GFX11-NEXT: s_endpgm 265entry: 266 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) 267 store <2 x i32> %0, ptr addrspace(1) %in 268 ret void 269} 270 271define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) { 272; SI-LABEL: fshr_v2i32_imm: 273; SI: ; %bb.0: ; %entry 274; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 275; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 276; SI-NEXT: s_mov_b32 s7, 0xf000 277; SI-NEXT: s_mov_b32 s6, -1 278; SI-NEXT: s_waitcnt lgkmcnt(0) 279; SI-NEXT: v_mov_b32_e32 v0, s3 280; SI-NEXT: v_mov_b32_e32 v2, s2 281; SI-NEXT: v_alignbit_b32 v1, s1, v0, 9 282; SI-NEXT: v_alignbit_b32 v0, s0, v2, 7 283; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 284; SI-NEXT: s_endpgm 285; 286; VI-LABEL: fshr_v2i32_imm: 287; VI: ; %bb.0: ; %entry 288; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 289; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 290; VI-NEXT: s_waitcnt lgkmcnt(0) 291; VI-NEXT: v_mov_b32_e32 v0, s3 292; VI-NEXT: v_mov_b32_e32 v2, s2 293; VI-NEXT: v_alignbit_b32 v1, s1, v0, 9 294; VI-NEXT: v_alignbit_b32 v0, s0, v2, 7 295; VI-NEXT: v_mov_b32_e32 v2, s4 296; VI-NEXT: v_mov_b32_e32 v3, s5 297; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 298; VI-NEXT: s_endpgm 299; 300; GFX9-LABEL: fshr_v2i32_imm: 301; GFX9: ; %bb.0: ; %entry 302; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 303; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 304; GFX9-NEXT: v_mov_b32_e32 v2, 0 305; GFX9-NEXT: s_waitcnt lgkmcnt(0) 306; GFX9-NEXT: v_mov_b32_e32 v0, s3 307; GFX9-NEXT: v_mov_b32_e32 v3, s2 308; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 9 309; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 7 310; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 311; GFX9-NEXT: s_endpgm 312; 313; R600-LABEL: fshr_v2i32_imm: 314; R600: ; %bb.0: ; %entry 315; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 316; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 317; R600-NEXT: CF_END 318; R600-NEXT: PAD 319; R600-NEXT: ALU clause starting at 4: 320; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x, 321; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 322; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x, 323; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 324; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 325; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 326; 327; GFX10-LABEL: fshr_v2i32_imm: 328; GFX10: ; %bb.0: ; %entry 329; GFX10-NEXT: s_clause 0x1 330; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 331; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 332; GFX10-NEXT: v_mov_b32_e32 v2, 0 333; GFX10-NEXT: s_waitcnt lgkmcnt(0) 334; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 9 335; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 7 336; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 337; GFX10-NEXT: s_endpgm 338; 339; GFX11-LABEL: fshr_v2i32_imm: 340; GFX11: ; %bb.0: ; %entry 341; GFX11-NEXT: s_clause 0x1 342; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c 343; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 344; GFX11-NEXT: v_mov_b32_e32 v2, 0 345; GFX11-NEXT: s_waitcnt lgkmcnt(0) 346; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 9 347; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 7 348; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 349; GFX11-NEXT: s_endpgm 350entry: 351 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) 352 store <2 x i32> %0, ptr addrspace(1) %in 353 ret void 354} 355 356define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { 357; SI-LABEL: fshr_v4i32: 358; SI: ; %bb.0: ; %entry 359; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 360; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15 361; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 362; SI-NEXT: s_mov_b32 s7, 0xf000 363; SI-NEXT: s_mov_b32 s6, -1 364; SI-NEXT: s_waitcnt lgkmcnt(0) 365; SI-NEXT: v_mov_b32_e32 v0, s15 366; SI-NEXT: v_mov_b32_e32 v1, s3 367; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 368; SI-NEXT: v_mov_b32_e32 v0, s14 369; SI-NEXT: v_mov_b32_e32 v1, s2 370; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 371; SI-NEXT: v_mov_b32_e32 v0, s13 372; SI-NEXT: v_mov_b32_e32 v1, s1 373; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 374; SI-NEXT: v_mov_b32_e32 v0, s12 375; SI-NEXT: v_mov_b32_e32 v4, s0 376; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 377; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 378; SI-NEXT: s_endpgm 379; 380; VI-LABEL: fshr_v4i32: 381; VI: ; %bb.0: ; %entry 382; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 383; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 384; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 385; VI-NEXT: s_waitcnt lgkmcnt(0) 386; VI-NEXT: v_mov_b32_e32 v0, s15 387; VI-NEXT: v_mov_b32_e32 v1, s3 388; VI-NEXT: v_mov_b32_e32 v2, s14 389; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 390; VI-NEXT: v_mov_b32_e32 v0, s2 391; VI-NEXT: v_alignbit_b32 v2, s10, v2, v0 392; VI-NEXT: v_mov_b32_e32 v0, s13 393; VI-NEXT: v_mov_b32_e32 v1, s1 394; VI-NEXT: v_alignbit_b32 v1, s9, v0, v1 395; VI-NEXT: v_mov_b32_e32 v0, s12 396; VI-NEXT: v_mov_b32_e32 v4, s0 397; VI-NEXT: v_alignbit_b32 v0, s8, v0, v4 398; VI-NEXT: v_mov_b32_e32 v4, s4 399; VI-NEXT: v_mov_b32_e32 v5, s5 400; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 401; VI-NEXT: s_endpgm 402; 403; GFX9-LABEL: fshr_v4i32: 404; GFX9: ; %bb.0: ; %entry 405; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 406; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 407; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 408; GFX9-NEXT: v_mov_b32_e32 v4, 0 409; GFX9-NEXT: s_waitcnt lgkmcnt(0) 410; GFX9-NEXT: v_mov_b32_e32 v0, s15 411; GFX9-NEXT: v_mov_b32_e32 v1, s3 412; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1 413; GFX9-NEXT: v_mov_b32_e32 v0, s14 414; GFX9-NEXT: v_mov_b32_e32 v1, s2 415; GFX9-NEXT: v_alignbit_b32 v2, s10, v0, v1 416; GFX9-NEXT: v_mov_b32_e32 v0, s13 417; GFX9-NEXT: v_mov_b32_e32 v1, s1 418; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, v1 419; GFX9-NEXT: v_mov_b32_e32 v0, s12 420; GFX9-NEXT: v_mov_b32_e32 v5, s0 421; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, v5 422; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 423; GFX9-NEXT: s_endpgm 424; 425; R600-LABEL: fshr_v4i32: 426; R600: ; %bb.0: ; %entry 427; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 428; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 429; R600-NEXT: CF_END 430; R600-NEXT: PAD 431; R600-NEXT: ALU clause starting at 4: 432; R600-NEXT: MOV * T0.W, KC0[6].X, 433; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W, 434; R600-NEXT: MOV * T1.W, KC0[5].W, 435; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W, 436; R600-NEXT: MOV * T1.W, KC0[5].Z, 437; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W, 438; R600-NEXT: MOV * T1.W, KC0[5].Y, 439; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W, 440; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 441; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 442; 443; GFX10-LABEL: fshr_v4i32: 444; GFX10: ; %bb.0: ; %entry 445; GFX10-NEXT: s_clause 0x2 446; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 447; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 448; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 449; GFX10-NEXT: v_mov_b32_e32 v6, 0 450; GFX10-NEXT: s_waitcnt lgkmcnt(0) 451; GFX10-NEXT: v_mov_b32_e32 v0, s3 452; GFX10-NEXT: v_mov_b32_e32 v1, s2 453; GFX10-NEXT: v_mov_b32_e32 v4, s1 454; GFX10-NEXT: v_mov_b32_e32 v5, s0 455; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, v0 456; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, v1 457; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, v4 458; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, v5 459; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7] 460; GFX10-NEXT: s_endpgm 461; 462; GFX11-LABEL: fshr_v4i32: 463; GFX11: ; %bb.0: ; %entry 464; GFX11-NEXT: s_clause 0x2 465; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54 466; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 467; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 468; GFX11-NEXT: v_mov_b32_e32 v6, 0 469; GFX11-NEXT: s_waitcnt lgkmcnt(0) 470; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 471; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 472; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 473; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, v0 474; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, v1 475; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) 476; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, v4 477; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, v5 478; GFX11-NEXT: global_store_b128 v6, v[0:3], s[4:5] 479; GFX11-NEXT: s_endpgm 480entry: 481 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) 482 store <4 x i32> %0, ptr addrspace(1) %in 483 ret void 484} 485 486define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) { 487; SI-LABEL: fshr_v4i32_imm: 488; SI: ; %bb.0: ; %entry 489; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 490; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 491; SI-NEXT: s_mov_b32 s3, 0xf000 492; SI-NEXT: s_mov_b32 s2, -1 493; SI-NEXT: s_waitcnt lgkmcnt(0) 494; SI-NEXT: v_mov_b32_e32 v0, s15 495; SI-NEXT: v_mov_b32_e32 v1, s14 496; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 497; SI-NEXT: v_mov_b32_e32 v0, s13 498; SI-NEXT: v_alignbit_b32 v2, s10, v1, 9 499; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 500; SI-NEXT: v_mov_b32_e32 v0, s12 501; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 502; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 503; SI-NEXT: s_endpgm 504; 505; VI-LABEL: fshr_v4i32_imm: 506; VI: ; %bb.0: ; %entry 507; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 508; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 509; VI-NEXT: s_waitcnt lgkmcnt(0) 510; VI-NEXT: v_mov_b32_e32 v0, s15 511; VI-NEXT: v_mov_b32_e32 v1, s14 512; VI-NEXT: v_mov_b32_e32 v4, s13 513; VI-NEXT: v_alignbit_b32 v3, s11, v0, 1 514; VI-NEXT: v_alignbit_b32 v2, s10, v1, 9 515; VI-NEXT: v_alignbit_b32 v1, s9, v4, 7 516; VI-NEXT: v_mov_b32_e32 v0, s12 517; VI-NEXT: v_mov_b32_e32 v5, s1 518; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1 519; VI-NEXT: v_mov_b32_e32 v4, s0 520; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 521; VI-NEXT: s_endpgm 522; 523; GFX9-LABEL: fshr_v4i32_imm: 524; GFX9: ; %bb.0: ; %entry 525; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 526; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 527; GFX9-NEXT: v_mov_b32_e32 v4, 0 528; GFX9-NEXT: s_waitcnt lgkmcnt(0) 529; GFX9-NEXT: v_mov_b32_e32 v0, s15 530; GFX9-NEXT: v_mov_b32_e32 v1, s14 531; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 1 532; GFX9-NEXT: v_mov_b32_e32 v0, s13 533; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 9 534; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 7 535; GFX9-NEXT: v_mov_b32_e32 v0, s12 536; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1 537; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 538; GFX9-NEXT: s_endpgm 539; 540; R600-LABEL: fshr_v4i32_imm: 541; R600: ; %bb.0: ; %entry 542; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 543; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 544; R600-NEXT: CF_END 545; R600-NEXT: PAD 546; R600-NEXT: ALU clause starting at 4: 547; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, 548; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x, 549; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 550; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x, 551; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 552; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1, 553; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 554; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 555; 556; GFX10-LABEL: fshr_v4i32_imm: 557; GFX10: ; %bb.0: ; %entry 558; GFX10-NEXT: s_clause 0x1 559; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 560; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 561; GFX10-NEXT: v_mov_b32_e32 v4, 0 562; GFX10-NEXT: s_waitcnt lgkmcnt(0) 563; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 1 564; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 9 565; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 7 566; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 1 567; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 568; GFX10-NEXT: s_endpgm 569; 570; GFX11-LABEL: fshr_v4i32_imm: 571; GFX11: ; %bb.0: ; %entry 572; GFX11-NEXT: s_clause 0x1 573; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34 574; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 575; GFX11-NEXT: v_mov_b32_e32 v4, 0 576; GFX11-NEXT: s_waitcnt lgkmcnt(0) 577; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 1 578; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 9 579; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 7 580; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 1 581; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 582; GFX11-NEXT: s_endpgm 583entry: 584 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>) 585 store <4 x i32> %0, ptr addrspace(1) %in 586 ret void 587} 588 589define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { 590; GFX89-LABEL: v_fshr_i32: 591; GFX89: ; %bb.0: 592; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 593; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2 594; GFX89-NEXT: s_setpc_b64 s[30:31] 595; 596; R600-LABEL: v_fshr_i32: 597; R600: ; %bb.0: 598; R600-NEXT: CF_END 599; R600-NEXT: PAD 600; 601; GFX10-LABEL: v_fshr_i32: 602; GFX10: ; %bb.0: 603; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 604; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 605; GFX10-NEXT: s_setpc_b64 s[30:31] 606; 607; GFX11-LABEL: v_fshr_i32: 608; GFX11: ; %bb.0: 609; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 610; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 611; GFX11-NEXT: s_setpc_b64 s[30:31] 612 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) 613 ret i32 %ret 614} 615 616define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) { 617; GFX89-LABEL: v_fshr_v2i32: 618; GFX89: ; %bb.0: 619; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 620; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4 621; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5 622; GFX89-NEXT: s_setpc_b64 s[30:31] 623; 624; R600-LABEL: v_fshr_v2i32: 625; R600: ; %bb.0: 626; R600-NEXT: CF_END 627; R600-NEXT: PAD 628; 629; GFX10-LABEL: v_fshr_v2i32: 630; GFX10: ; %bb.0: 631; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 632; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 633; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 634; GFX10-NEXT: s_setpc_b64 s[30:31] 635; 636; GFX11-LABEL: v_fshr_v2i32: 637; GFX11: ; %bb.0: 638; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 639; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 640; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 641; GFX11-NEXT: s_setpc_b64 s[30:31] 642 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) 643 ret <2 x i32> %ret 644} 645 646define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) { 647; GFX89-LABEL: v_fshr_v3i32: 648; GFX89: ; %bb.0: 649; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 650; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6 651; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7 652; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8 653; GFX89-NEXT: s_setpc_b64 s[30:31] 654; 655; R600-LABEL: v_fshr_v3i32: 656; R600: ; %bb.0: 657; R600-NEXT: CF_END 658; R600-NEXT: PAD 659; 660; GFX10-LABEL: v_fshr_v3i32: 661; GFX10: ; %bb.0: 662; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 663; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 664; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 665; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 666; GFX10-NEXT: s_setpc_b64 s[30:31] 667; 668; GFX11-LABEL: v_fshr_v3i32: 669; GFX11: ; %bb.0: 670; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 671; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 672; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 673; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 674; GFX11-NEXT: s_setpc_b64 s[30:31] 675 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) 676 ret <3 x i32> %ret 677} 678 679define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) { 680; GFX89-LABEL: v_fshr_v4i32: 681; GFX89: ; %bb.0: 682; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 683; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8 684; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9 685; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10 686; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11 687; GFX89-NEXT: s_setpc_b64 s[30:31] 688; 689; R600-LABEL: v_fshr_v4i32: 690; R600: ; %bb.0: 691; R600-NEXT: CF_END 692; R600-NEXT: PAD 693; 694; GFX10-LABEL: v_fshr_v4i32: 695; GFX10: ; %bb.0: 696; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 697; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 698; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 699; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 700; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 701; GFX10-NEXT: s_setpc_b64 s[30:31] 702; 703; GFX11-LABEL: v_fshr_v4i32: 704; GFX11: ; %bb.0: 705; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 706; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 707; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 708; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 709; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 710; GFX11-NEXT: s_setpc_b64 s[30:31] 711 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) 712 ret <4 x i32> %ret 713} 714 715define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) { 716; SI-LABEL: v_fshr_i16: 717; SI: ; %bb.0: 718; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 719; SI-NEXT: v_or_b32_e32 v2, 16, v2 720; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 721; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 722; SI-NEXT: s_setpc_b64 s[30:31] 723; 724; VI-LABEL: v_fshr_i16: 725; VI: ; %bb.0: 726; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 727; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 728; VI-NEXT: v_xor_b32_e32 v3, -1, v2 729; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 730; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 731; VI-NEXT: v_or_b32_e32 v0, v0, v1 732; VI-NEXT: s_setpc_b64 s[30:31] 733; 734; GFX9-LABEL: v_fshr_i16: 735; GFX9: ; %bb.0: 736; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 737; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 738; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 739; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 740; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 741; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 742; GFX9-NEXT: s_setpc_b64 s[30:31] 743; 744; R600-LABEL: v_fshr_i16: 745; R600: ; %bb.0: 746; R600-NEXT: CF_END 747; R600-NEXT: PAD 748; 749; GFX10-LABEL: v_fshr_i16: 750; GFX10: ; %bb.0: 751; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 752; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 753; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 754; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 755; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 756; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 757; GFX10-NEXT: s_setpc_b64 s[30:31] 758; 759; GFX11-LABEL: v_fshr_i16: 760; GFX11: ; %bb.0: 761; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 762; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 763; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 764; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 765; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 766; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 767; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 768; GFX11-NEXT: s_setpc_b64 s[30:31] 769 %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2) 770 ret i16 %ret 771} 772 773define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) { 774; SI-LABEL: v_fshr_v2i16: 775; SI: ; %bb.0: 776; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 777; SI-NEXT: v_or_b32_e32 v5, 16, v5 778; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 779; SI-NEXT: v_or_b32_e32 v4, 16, v4 780; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 781; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 782; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 783; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 784; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 785; SI-NEXT: v_or_b32_e32 v0, v0, v3 786; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 787; SI-NEXT: s_setpc_b64 s[30:31] 788; 789; VI-LABEL: v_fshr_v2i16: 790; VI: ; %bb.0: 791; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 792; VI-NEXT: v_mov_b32_e32 v4, 1 793; VI-NEXT: v_mov_b32_e32 v5, -1 794; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 795; VI-NEXT: v_xor_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 796; VI-NEXT: v_lshrrev_b16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 797; VI-NEXT: v_lshlrev_b16_e32 v4, v5, v4 798; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 799; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 800; VI-NEXT: v_xor_b32_e32 v4, -1, v2 801; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 802; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 803; VI-NEXT: v_or_b32_e32 v0, v0, v1 804; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 805; VI-NEXT: s_setpc_b64 s[30:31] 806; 807; GFX9-LABEL: v_fshr_v2i16: 808; GFX9: ; %bb.0: 809; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 810; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 811; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] 812; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3 813; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 814; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 815; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 816; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 817; GFX9-NEXT: s_setpc_b64 s[30:31] 818; 819; R600-LABEL: v_fshr_v2i16: 820; R600: ; %bb.0: 821; R600-NEXT: CF_END 822; R600-NEXT: PAD 823; 824; GFX10-LABEL: v_fshr_v2i16: 825; GFX10: ; %bb.0: 826; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 827; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 828; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] 829; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 830; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 831; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 832; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 833; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 834; GFX10-NEXT: s_setpc_b64 s[30:31] 835; 836; GFX11-LABEL: v_fshr_v2i16: 837; GFX11: ; %bb.0: 838; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 839; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 840; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] 841; GFX11-NEXT: v_and_b32_e32 v2, 0xf000f, v2 842; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 843; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3 844; GFX11-NEXT: v_pk_lshrrev_b16 v1, v2, v1 845; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 846; GFX11-NEXT: v_pk_lshlrev_b16 v0, v3, v0 847; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 848; GFX11-NEXT: s_setpc_b64 s[30:31] 849 %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) 850 ret <2 x i16> %ret 851} 852 853define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) { 854; SI-LABEL: v_fshr_v3i16: 855; SI: ; %bb.0: 856; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 857; SI-NEXT: v_or_b32_e32 v7, 16, v7 858; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 859; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7 860; SI-NEXT: v_or_b32_e32 v4, 16, v6 861; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 862; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 863; SI-NEXT: v_or_b32_e32 v3, 16, v8 864; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 865; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 866; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 867; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 868; SI-NEXT: v_or_b32_e32 v0, v0, v1 869; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 870; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 871; SI-NEXT: s_setpc_b64 s[30:31] 872; 873; VI-LABEL: v_fshr_v3i16: 874; VI: ; %bb.0: 875; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 876; VI-NEXT: v_mov_b32_e32 v7, 1 877; VI-NEXT: v_mov_b32_e32 v8, -1 878; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 879; VI-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 880; VI-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 881; VI-NEXT: v_lshlrev_b16_e32 v7, v8, v7 882; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 883; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 884; VI-NEXT: v_xor_b32_e32 v7, -1, v5 885; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 886; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 887; VI-NEXT: v_or_b32_e32 v1, v1, v3 888; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 889; VI-NEXT: v_xor_b32_e32 v3, -1, v4 890; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 891; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2 892; VI-NEXT: v_or_b32_e32 v0, v0, v2 893; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 894; VI-NEXT: s_setpc_b64 s[30:31] 895; 896; GFX9-LABEL: v_fshr_v3i16: 897; GFX9: ; %bb.0: 898; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 899; GFX9-NEXT: v_mov_b32_e32 v7, 1 900; GFX9-NEXT: v_mov_b32_e32 v8, -1 901; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 902; GFX9-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 903; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 904; GFX9-NEXT: v_lshlrev_b16_e32 v7, v8, v7 905; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 906; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 907; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 908; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 909; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 910; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 911; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 912; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 913; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 914; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 915; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 916; GFX9-NEXT: s_mov_b32 s4, 0x5040100 917; GFX9-NEXT: v_perm_b32 v0, v6, v0, s4 918; GFX9-NEXT: s_setpc_b64 s[30:31] 919; 920; R600-LABEL: v_fshr_v3i16: 921; R600: ; %bb.0: 922; R600-NEXT: CF_END 923; R600-NEXT: PAD 924; 925; GFX10-LABEL: v_fshr_v3i16: 926; GFX10: ; %bb.0: 927; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 928; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 929; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 930; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2 931; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 932; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4 933; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 934; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 935; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 936; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 937; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 938; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 939; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 940; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 941; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 942; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 943; GFX10-NEXT: v_or_b32_e32 v5, v6, v7 944; GFX10-NEXT: v_lshlrev_b16 v1, v4, v1 945; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 946; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 947; GFX10-NEXT: s_setpc_b64 s[30:31] 948; 949; GFX11-LABEL: v_fshr_v3i16: 950; GFX11: ; %bb.0: 951; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 952; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 953; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 954; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 955; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 956; GFX11-NEXT: v_xor_b32_e32 v10, -1, v4 957; GFX11-NEXT: v_lshlrev_b16 v6, 1, v6 958; GFX11-NEXT: v_xor_b32_e32 v9, -1, v7 959; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 960; GFX11-NEXT: v_lshrrev_b16 v7, v7, v8 961; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0 962; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 963; GFX11-NEXT: v_lshlrev_b16 v6, v9, v6 964; GFX11-NEXT: v_xor_b32_e32 v4, -1, v5 965; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 966; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 967; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 968; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 969; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 970; GFX11-NEXT: v_lshlrev_b16 v1, v4, v1 971; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 972; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 973; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 974; GFX11-NEXT: s_setpc_b64 s[30:31] 975 %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) 976 ret <3 x i16> %ret 977} 978 979define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) { 980; SI-LABEL: v_fshr_v4i16: 981; SI: ; %bb.0: 982; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 983; SI-NEXT: v_or_b32_e32 v9, 16, v9 984; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 985; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9 986; SI-NEXT: v_or_b32_e32 v5, 16, v8 987; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 988; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5 989; SI-NEXT: v_or_b32_e32 v4, 16, v11 990; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 991; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 992; SI-NEXT: v_or_b32_e32 v5, 16, v10 993; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 994; SI-NEXT: v_alignbit_b32 v2, v2, v6, v5 995; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 996; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 997; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 998; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 999; SI-NEXT: v_or_b32_e32 v2, v2, v4 1000; SI-NEXT: v_or_b32_e32 v0, v0, v1 1001; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 1002; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 1003; SI-NEXT: s_setpc_b64 s[30:31] 1004; 1005; VI-LABEL: v_fshr_v4i16: 1006; VI: ; %bb.0: 1007; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1008; VI-NEXT: v_mov_b32_e32 v7, 1 1009; VI-NEXT: v_mov_b32_e32 v9, -1 1010; VI-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1011; VI-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1012; VI-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1013; VI-NEXT: v_lshlrev_b16_e32 v8, v10, v8 1014; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1015; VI-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1016; VI-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1017; VI-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1018; VI-NEXT: v_lshlrev_b16_e32 v7, v9, v7 1019; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1020; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 1021; VI-NEXT: v_xor_b32_e32 v8, -1, v5 1022; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 1023; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 1024; VI-NEXT: v_or_b32_e32 v1, v1, v3 1025; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 1026; VI-NEXT: v_xor_b32_e32 v3, -1, v4 1027; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 1028; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2 1029; VI-NEXT: v_or_b32_e32 v0, v0, v2 1030; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1031; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1032; VI-NEXT: s_setpc_b64 s[30:31] 1033; 1034; GFX9-LABEL: v_fshr_v4i16: 1035; GFX9: ; %bb.0: 1036; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1037; GFX9-NEXT: v_mov_b32_e32 v7, 1 1038; GFX9-NEXT: v_mov_b32_e32 v9, -1 1039; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1040; GFX9-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1041; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1042; GFX9-NEXT: v_lshlrev_b16_e32 v8, v10, v8 1043; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1044; GFX9-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1045; GFX9-NEXT: v_or_b32_e32 v6, v8, v6 1046; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1047; GFX9-NEXT: v_lshlrev_b16_e32 v7, v9, v7 1048; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 1049; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 1050; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 1051; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 1052; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 1053; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 1054; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 1055; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 1056; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 1057; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 1058; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 1059; GFX9-NEXT: s_mov_b32 s4, 0x5040100 1060; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 1061; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 1062; GFX9-NEXT: s_setpc_b64 s[30:31] 1063; 1064; R600-LABEL: v_fshr_v4i16: 1065; R600: ; %bb.0: 1066; R600-NEXT: CF_END 1067; R600-NEXT: PAD 1068; 1069; GFX10-LABEL: v_fshr_v4i16: 1070; GFX10: ; %bb.0: 1071; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1072; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 1073; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 1074; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 1075; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0 1076; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 1077; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 1078; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 1079; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 1080; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 1081; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9 1082; GFX10-NEXT: v_xor_b32_e32 v12, -1, v10 1083; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 1084; GFX10-NEXT: v_xor_b32_e32 v13, -1, v5 1085; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 1086; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4 1087; GFX10-NEXT: v_lshlrev_b16 v7, v7, v8 1088; GFX10-NEXT: v_lshrrev_b16 v8, v10, v11 1089; GFX10-NEXT: v_lshlrev_b16 v9, v12, v9 1090; GFX10-NEXT: v_lshlrev_b16 v1, v13, v1 1091; GFX10-NEXT: v_lshlrev_b16 v0, v14, v0 1092; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 1093; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 1094; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 1095; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 1096; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 1097; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 1098; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 1099; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 1100; GFX10-NEXT: s_setpc_b64 s[30:31] 1101; 1102; GFX11-LABEL: v_fshr_v4i16: 1103; GFX11: ; %bb.0: 1104; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1105; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 1106; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5 1107; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 1108; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 1109; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v4 1110; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 1111; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 1112; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8 1113; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 1114; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9 1115; GFX11-NEXT: v_xor_b32_e32 v12, -1, v10 1116; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 1117; GFX11-NEXT: v_xor_b32_e32 v13, -1, v5 1118; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 1119; GFX11-NEXT: v_xor_b32_e32 v14, -1, v4 1120; GFX11-NEXT: v_lshlrev_b16 v7, v7, v8 1121; GFX11-NEXT: v_lshrrev_b16 v8, v10, v11 1122; GFX11-NEXT: v_lshlrev_b16 v9, v12, v9 1123; GFX11-NEXT: v_lshlrev_b16 v1, v13, v1 1124; GFX11-NEXT: v_lshlrev_b16 v0, v14, v0 1125; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 1126; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 1127; GFX11-NEXT: v_or_b32_e32 v4, v7, v6 1128; GFX11-NEXT: v_or_b32_e32 v5, v9, v8 1129; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1130; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 1131; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 1132; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1133; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 1134; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 1135; GFX11-NEXT: s_setpc_b64 s[30:31] 1136 %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) 1137 ret <4 x i16> %ret 1138} 1139 1140define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) { 1141; SI-LABEL: v_fshr_i64: 1142; SI: ; %bb.0: 1143; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1144; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 1145; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 1146; SI-NEXT: v_not_b32_e32 v4, v4 1147; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 1148; SI-NEXT: v_or_b32_e32 v1, v1, v3 1149; SI-NEXT: v_or_b32_e32 v0, v0, v2 1150; SI-NEXT: s_setpc_b64 s[30:31] 1151; 1152; VI-LABEL: v_fshr_i64: 1153; VI: ; %bb.0: 1154; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1155; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1156; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1157; VI-NEXT: v_not_b32_e32 v4, v4 1158; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 1159; VI-NEXT: v_or_b32_e32 v1, v1, v3 1160; VI-NEXT: v_or_b32_e32 v0, v0, v2 1161; VI-NEXT: s_setpc_b64 s[30:31] 1162; 1163; GFX9-LABEL: v_fshr_i64: 1164; GFX9: ; %bb.0: 1165; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1166; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1167; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1168; GFX9-NEXT: v_not_b32_e32 v4, v4 1169; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 1170; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 1171; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 1172; GFX9-NEXT: s_setpc_b64 s[30:31] 1173; 1174; R600-LABEL: v_fshr_i64: 1175; R600: ; %bb.0: 1176; R600-NEXT: CF_END 1177; R600-NEXT: PAD 1178; 1179; GFX10-LABEL: v_fshr_i64: 1180; GFX10: ; %bb.0: 1181; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1182; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1183; GFX10-NEXT: v_not_b32_e32 v5, v4 1184; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1185; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] 1186; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 1187; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 1188; GFX10-NEXT: s_setpc_b64 s[30:31] 1189; 1190; GFX11-LABEL: v_fshr_i64: 1191; GFX11: ; %bb.0: 1192; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1193; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1194; GFX11-NEXT: v_not_b32_e32 v5, v4 1195; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1196; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1197; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] 1198; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 1199; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1200; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 1201; GFX11-NEXT: s_setpc_b64 s[30:31] 1202 %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2) 1203 ret i64 %ret 1204} 1205 1206define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) { 1207; SI-LABEL: v_fshr_v2i64: 1208; SI: ; %bb.0: 1209; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1210; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 1211; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 1212; SI-NEXT: v_not_b32_e32 v8, v8 1213; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 1214; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 1215; SI-NEXT: v_or_b32_e32 v1, v1, v5 1216; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10 1217; SI-NEXT: v_not_b32_e32 v7, v10 1218; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 1219; SI-NEXT: v_or_b32_e32 v0, v0, v4 1220; SI-NEXT: v_or_b32_e32 v3, v3, v6 1221; SI-NEXT: v_or_b32_e32 v2, v2, v5 1222; SI-NEXT: s_setpc_b64 s[30:31] 1223; 1224; VI-LABEL: v_fshr_v2i64: 1225; VI: ; %bb.0: 1226; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1227; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1228; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] 1229; VI-NEXT: v_not_b32_e32 v8, v8 1230; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 1231; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1232; VI-NEXT: v_or_b32_e32 v1, v1, v5 1233; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7] 1234; VI-NEXT: v_not_b32_e32 v7, v10 1235; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 1236; VI-NEXT: v_or_b32_e32 v0, v0, v4 1237; VI-NEXT: v_or_b32_e32 v3, v3, v6 1238; VI-NEXT: v_or_b32_e32 v2, v2, v5 1239; VI-NEXT: s_setpc_b64 s[30:31] 1240; 1241; GFX9-LABEL: v_fshr_v2i64: 1242; GFX9: ; %bb.0: 1243; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1244; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1245; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] 1246; GFX9-NEXT: v_not_b32_e32 v8, v8 1247; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 1248; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1249; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 1250; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7] 1251; GFX9-NEXT: v_not_b32_e32 v7, v10 1252; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 1253; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 1254; GFX9-NEXT: v_or_b32_e32 v3, v3, v6 1255; GFX9-NEXT: v_or_b32_e32 v2, v2, v5 1256; GFX9-NEXT: s_setpc_b64 s[30:31] 1257; 1258; R600-LABEL: v_fshr_v2i64: 1259; R600: ; %bb.0: 1260; R600-NEXT: CF_END 1261; R600-NEXT: PAD 1262; 1263; GFX10-LABEL: v_fshr_v2i64: 1264; GFX10: ; %bb.0: 1265; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1266; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1267; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1268; GFX10-NEXT: v_not_b32_e32 v9, v8 1269; GFX10-NEXT: v_not_b32_e32 v11, v10 1270; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] 1271; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] 1272; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] 1273; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 1274; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 1275; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 1276; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 1277; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 1278; GFX10-NEXT: s_setpc_b64 s[30:31] 1279; 1280; GFX11-LABEL: v_fshr_v2i64: 1281; GFX11: ; %bb.0: 1282; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1283; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1284; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1285; GFX11-NEXT: v_not_b32_e32 v9, v8 1286; GFX11-NEXT: v_not_b32_e32 v11, v10 1287; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] 1288; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] 1289; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1290; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] 1291; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 1292; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 1293; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 1294; GFX11-NEXT: v_or_b32_e32 v1, v1, v5 1295; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) 1296; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 1297; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 1298; GFX11-NEXT: s_setpc_b64 s[30:31] 1299 %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) 1300 ret <2 x i64> %ret 1301} 1302 1303define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { 1304; SI-LABEL: v_fshr_i24: 1305; SI: ; %bb.0: 1306; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1307; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2 1308; SI-NEXT: s_mov_b32 s4, 0xaaaaaab 1309; SI-NEXT: v_mul_hi_u32 v3, v3, s4 1310; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1311; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 1312; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 1313; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 1314; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 1315; SI-NEXT: s_setpc_b64 s[30:31] 1316; 1317; VI-LABEL: v_fshr_i24: 1318; VI: ; %bb.0: 1319; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1320; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2 1321; VI-NEXT: s_mov_b32 s4, 0xaaaaaab 1322; VI-NEXT: v_mul_hi_u32 v3, v3, s4 1323; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1324; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 1325; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 1326; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 1327; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2 1328; VI-NEXT: s_setpc_b64 s[30:31] 1329; 1330; GFX9-LABEL: v_fshr_i24: 1331; GFX9: ; %bb.0: 1332; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1333; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2 1334; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab 1335; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4 1336; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1337; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3 1338; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 1339; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 1340; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2 1341; GFX9-NEXT: s_setpc_b64 s[30:31] 1342; 1343; R600-LABEL: v_fshr_i24: 1344; R600: ; %bb.0: 1345; R600-NEXT: CF_END 1346; R600-NEXT: PAD 1347; 1348; GFX10-LABEL: v_fshr_i24: 1349; GFX10: ; %bb.0: 1350; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1351; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2 1352; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1353; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 1354; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3 1355; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 1356; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 1357; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 1358; GFX10-NEXT: s_setpc_b64 s[30:31] 1359; 1360; GFX11-LABEL: v_fshr_i24: 1361; GFX11: ; %bb.0: 1362; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1363; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2 1364; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1365; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1366; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 1367; GFX11-NEXT: v_mul_u32_u24_e32 v3, 24, v3 1368; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1369; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 1370; GFX11-NEXT: v_add_nc_u32_e32 v2, 8, v2 1371; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1372; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 1373; GFX11-NEXT: s_setpc_b64 s[30:31] 1374 %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) 1375 ret i24 %ret 1376} 1377 1378define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) { 1379; SI-LABEL: v_fshr_v2i24: 1380; SI: ; %bb.0: 1381; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1382; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 1383; SI-NEXT: s_mov_b32 s4, 0xaaaaaab 1384; SI-NEXT: v_mul_hi_u32 v6, v6, s4 1385; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 1386; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1387; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 1388; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 1389; SI-NEXT: v_mul_hi_u32 v6, v7, s4 1390; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 1391; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 1392; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1393; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 1394; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 1395; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 1396; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 1397; SI-NEXT: s_setpc_b64 s[30:31] 1398; 1399; VI-LABEL: v_fshr_v2i24: 1400; VI: ; %bb.0: 1401; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1402; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 1403; VI-NEXT: s_mov_b32 s4, 0xaaaaaab 1404; VI-NEXT: v_mul_hi_u32 v6, v6, s4 1405; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 1406; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1407; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 1408; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 1409; VI-NEXT: v_mul_hi_u32 v6, v7, s4 1410; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 1411; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 1412; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1413; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 1414; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3 1415; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 1416; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 1417; VI-NEXT: s_setpc_b64 s[30:31] 1418; 1419; GFX9-LABEL: v_fshr_v2i24: 1420; GFX9: ; %bb.0: 1421; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1422; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4 1423; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab 1424; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4 1425; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5 1426; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1427; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6 1428; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 1429; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4 1430; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 1431; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 1432; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1433; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6 1434; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3 1435; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 1436; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 1437; GFX9-NEXT: s_setpc_b64 s[30:31] 1438; 1439; R600-LABEL: v_fshr_v2i24: 1440; R600: ; %bb.0: 1441; R600-NEXT: CF_END 1442; R600-NEXT: PAD 1443; 1444; GFX10-LABEL: v_fshr_v2i24: 1445; GFX10: ; %bb.0: 1446; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1447; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 1448; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 1449; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1450; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1451; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 1452; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 1453; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 1454; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7 1455; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 1456; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 1457; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4 1458; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5 1459; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 1460; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 1461; GFX10-NEXT: s_setpc_b64 s[30:31] 1462; 1463; GFX11-LABEL: v_fshr_v2i24: 1464; GFX11: ; %bb.0: 1465; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1466; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v4 1467; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v5 1468; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1469; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1470; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1471; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 1472; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 1473; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1474; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 1475; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 1476; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1477; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 1478; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 1479; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1480; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 1481; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 1482; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1483; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 1484; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 1485; GFX11-NEXT: s_setpc_b64 s[30:31] 1486 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) 1487 ret <2 x i24> %ret 1488} 1489