; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-GISEL

; Codegen test for the llvm.bitreverse.* intrinsics on amdgcn.
;
; Six llc invocations are checked: tahiti, tonga and fiji (these two share one
; check prefix), fiji with -global-isel, gfx1100, and gfx1100 with -global-isel.
;
; Naming used by the kernels below:
;   s_brev_*  the value to reverse is passed directly as a kernel argument;
;             the expected code uses the scalar s_brev_b32 / s_brev_b64.
;   v_brev_*  the value is loaded from the %valptr kernel argument; the
;             expected code uses the vector v_bfrev_b32.
;
; The expected-output comments inside each function are regenerated by the
; script named on the first line; do not edit them by hand.

declare i32 @llvm.amdgcn.workitem.id.x() #1

; Scalar bitreverse intrinsics exercised by the i16, i32 and i64 tests.
declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1

; Vector variants. The <2 x ...> forms are called below; the <4 x ...> forms
; are declared but no function in this file calls them.
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1

declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1

; i16 bitreverse of a kernel argument, stored to %out.
; Every run expects one s_brev_b32 on the 32-bit register holding the argument.
; The result is then either shifted right by 16 (s_lshr_b32) or, in the gfx1100
; run without -global-isel, stored with global_store_d16_hi_b16. Both
; -global-isel runs additionally mask the argument with 0xffff before reversing.
define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 {
; SI-LABEL: s_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s4, s6
; SI-NEXT:    s_lshr_b32 s4, s4, 16
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dword s6, s[4:5], 0x2c
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s4, s6
; FLAT-NEXT:    s_lshr_b32 s4, s4, 16
; FLAT-NEXT:    v_mov_b32_e32 v0, s4
; FLAT-NEXT:    buffer_store_short v0, off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
; GISEL-NEXT:    s_brev_b32 s2, s2
; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_store_short v[0:1], v2
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_i16:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_clause 0x1
; GFX11-FLAT-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-FLAT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: s_brev_i16:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_clause 0x1
; GFX11-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
; GFX11-GISEL-NEXT:    s_lshr_b32 s2, s2, 16
; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, ptr addrspace(1) %out
  ret void
}

; i16 bitreverse of a value loaded from %valptr (no per-workitem indexing).
; Every run expects one v_bfrev_b32 on the loaded value. The tahiti, tonga and
; fiji runs follow it with v_lshrrev_b32 by 16; both gfx1100 runs instead store
; the result with global_store_d16_hi_b16.
define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
; SI-LABEL: v_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_mov_b32 s10, s6
; FLAT-NEXT:    s_mov_b32 s11, s7
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_mov_b32 s8, s2
; FLAT-NEXT:    s_mov_b32 s9, s3
; FLAT-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; FLAT-NEXT:    s_mov_b32 s4, s0
; FLAT-NEXT:    s_mov_b32 s5, s1
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_store_short v[0:1], v2
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: v_brev_i16:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
; GFX11-FLAT-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    s_mov_b32 s4, s2
; GFX11-FLAT-NEXT:    s_mov_b32 s5, s3
; GFX11-FLAT-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: v_brev_i16:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %val = load i16, ptr addrspace(1) %valptr
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, ptr addrspace(1) %out
  ret void
}

; i32 bitreverse of a kernel argument, stored to %out.
; Every run expects exactly one s_brev_b32 and no extra shift or mask.
define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 {
; SI-LABEL: s_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s4, s6
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dword s6, s[4:5], 0x2c
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s4, s6
; FLAT-NEXT:    v_mov_b32_e32 v0, s4
; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_brev_b32 s2, s2
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_store_dword v[0:1], v2
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_i32:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_clause 0x1
; GFX11-FLAT-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-FLAT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FLAT-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: s_brev_i32:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_clause 0x1
; GFX11-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, ptr addrspace(1) %out
  ret void
}

; i32 bitreverse of %valptr[workitem.id.x], stored to %out.
; Every run expects exactly one v_bfrev_b32 on the loaded dword.
define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
; SI-LABEL: v_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s3
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dword v0, v[0:1]
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dword v0, v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_store_dword v[0:1], v2
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: v_brev_i32:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: v_brev_i32:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
  %val = load i32, ptr addrspace(1) %gep
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, ptr addrspace(1) %out
  ret void
}

; <2 x i32> bitreverse of a kernel argument, stored to %out.
; Every run expects two independent s_brev_b32, one per vector element.
define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 {
; SI-LABEL: s_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_brev_b32 s0, s3
; SI-NEXT:    s_brev_b32 s1, s2
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_mov_b32 s4, s0
; FLAT-NEXT:    s_mov_b32 s5, s1
; FLAT-NEXT:    s_brev_b32 s0, s3
; FLAT-NEXT:    s_brev_b32 s1, s2
; FLAT-NEXT:    v_mov_b32_e32 v0, s1
; FLAT-NEXT:    v_mov_b32_e32 v1, s0
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_brev_b32 s2, s2
; GISEL-NEXT:    s_brev_b32 s3, s3
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s1
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_v2i32:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
; GFX11-FLAT-NEXT:    s_brev_b32 s3, s3
; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-FLAT-NEXT:    s_mov_b32 s4, s0
; GFX11-FLAT-NEXT:    s_mov_b32 s5, s1
; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: s_brev_v2i32:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
; GFX11-GISEL-NEXT:    s_brev_b32 s3, s3
; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, ptr addrspace(1) %out
  ret void
}

; <2 x i32> bitreverse of %valptr[workitem.id.x], stored to %out.
; Every run expects two v_bfrev_b32, each writing back to the register it read.
define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s3
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v3, s1
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: v_brev_v2i32:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: v_brev_v2i32:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
  %val = load <2 x i32>, ptr addrspace(1) %gep
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, ptr addrspace(1) %out
  ret void
}

; i64 bitreverse of a kernel argument, stored to %out.
; Every run expects a single 64-bit s_brev_b64 on the register pair.
define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 {
; SI-LABEL: s_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_brev_b64 s[0:1], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_mov_b32 s4, s0
; FLAT-NEXT:    s_mov_b32 s5, s1
; FLAT-NEXT:    s_brev_b64 s[0:1], s[2:3]
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s1
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_i64:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    s_brev_b64 s[4:5], s[2:3]
; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: s_brev_i64:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, ptr addrspace(1) %out
  ret void
}

; i64 bitreverse of %valptr[workitem.id.x], stored to %out.
; Every run expects two v_bfrev_b32 with the two registers swapped: the first
; register of the stored pair holds the reversed second loaded register, and
; the second stored register holds the reversed first loaded register.
define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
; SI-LABEL: v_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v2, v0
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s3
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v4, s1
; GISEL-NEXT:    v_mov_b32_e32 v3, s0
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: v_brev_i64:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[0:3], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: v_brev_i64:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i64, ptr addrspace(1) %valptr, i32 %tid
  %val = load i64, ptr addrspace(1) %gep
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, ptr addrspace(1) %out
  ret void
}

; <2 x i64> bitreverse of a kernel argument, stored to %out.
; Every run expects two s_brev_b64, one per 64-bit element.
define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 {
; SI-LABEL: s_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b64 s[2:3], s[2:3]
; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_mov_b32_e32 v2, s2
; FLAT-NEXT:    v_mov_b32_e32 v3, s3
; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v4, s4
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    v_mov_b32_e32 v5, s5
; GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_v2i64:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_clause 0x1
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-FLAT-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GFX11-FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-FLAT-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-FLAT-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: s_brev_v2i64:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_clause 0x1
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GFX11-GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-GISEL-NEXT:    s_endpgm
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, ptr addrspace(1) %out
  ret void
}

; <2 x i64> bitreverse of %valptr[workitem.id.x], stored to %out.
; Every run expects four v_bfrev_b32. Within each pair of loaded registers
; (first/second and third/fourth) the reversed values are swapped in the
; stored register tuple, as in the single-i64 vector case.
define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v4, v2
; SI-NEXT:    v_bfrev_b32_e32 v3, v3
; SI-NEXT:    v_bfrev_b32_e32 v2, v0
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s3
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_v2i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; GISEL-NEXT:    s_endpgm
;
; GFX11-FLAT-LABEL: v_brev_v2i64:
; GFX11-FLAT:       ; %bb.0:
; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FLAT-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[0:3], 0
; GFX11-FLAT-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: v_brev_v2i64:
; GFX11-GISEL:       ; %bb.0:
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[0:1]
; GFX11-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i64> , ptr addrspace(1) %valptr, i32 %tid
  %val = load <2 x i64>, ptr addrspace(1) %gep
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, ptr addrspace(1) %out
  ret void
}

; Non-kernel function: truncates %arg to i16, bit-reverses it, reinterprets the
; result as half and extends it to float.
; Every run expects v_bfrev_b32 on the full 32-bit input followed by a
; selection of bits 31:16 before the f16-to-f32 conversion: either an explicit
; v_lshrrev_b32 by 16, or an SDWA conversion with src0_sel:WORD_1 (tonga and
; fiji runs).
; NOTE(review): the name suggests this is a regression test for a truncate
; that was once dropped when promoting the i16 bitreverse to 32 bits; confirm
; against the commit that introduced it.
define float @missing_truncate_promote_bitreverse(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bitreverse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; FLAT-LABEL: missing_truncate_promote_bitreverse:
; FLAT:       ; %bb.0: ; %bb
; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; FLAT-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: missing_truncate_promote_bitreverse:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLAT-LABEL: missing_truncate_promote_bitreverse:
; GFX11-FLAT:       ; %bb.0: ; %bb
; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX11-FLAT-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FLAT-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: missing_truncate_promote_bitreverse:
; GFX11-GISEL:       ; %bb.0: ; %bb
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

; #0 is attached to every amdgpu_kernel definition above; #1 is attached to the
; intrinsic declarations and to most of the bitreverse call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }