; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16

; Declarations of the llvm.bswap overloads called by the functions below.
declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) nounwind readnone
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
declare i48 @llvm.bswap.i48(i48) nounwind readnone

; Kernel: load an i32 from %in, byte-swap it, store the result to %out.
define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v0, 0, s2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, s2, 0x10203
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
  store i32 %bswap, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: load a <2 x i32> from %in, byte-swap each element, store to %out.
define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v2, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v3, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v1, 0, s3, v0
; VI-NEXT:    v_perm_b32 v0, 0, s2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v1, 0, s5, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, s4, 0x10203
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %val = load <2 x i32>, ptr addrspace(1) %in, align 8
  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
  store <2 x i32> %bswap, ptr addrspace(1) %out, align 8
  ret void
}

; Kernel: load a <4 x i32> from %in, byte-swap each element, store to %out.
define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s8, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v5, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v6, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v7, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v8, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s11, v0
; VI-NEXT:    v_perm_b32 v2, 0, s10, v0
; VI-NEXT:    v_perm_b32 v1, 0, s9, v0
; VI-NEXT:    v_perm_b32 v0, 0, s8, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v3, 0, s7, 0x10203
; GFX11-NEXT:    v_perm_b32 v2, 0, s6, 0x10203
; GFX11-NEXT:    v_perm_b32 v1, 0, s5, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, s4, 0x10203
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %val = load <4 x i32>, ptr addrspace(1) %in, align 16
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
  store <4 x i32> %bswap, ptr addrspace(1) %out, align 16
  ret void
}

; Kernel: load an <8 x i32> from %in, byte-swap each element, store to %out.
define amdgpu_kernel void @test_bswap_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v8i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s12, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v5, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v6, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v7, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v8, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v9, s11, s11, 8
; SI-NEXT:    v_alignbit_b32 v10, s11, s11, 24
; SI-NEXT:    v_alignbit_b32 v11, s10, s10, 8
; SI-NEXT:    v_alignbit_b32 v12, s10, s10, 24
; SI-NEXT:    v_alignbit_b32 v13, s9, s9, 8
; SI-NEXT:    v_alignbit_b32 v14, s9, s9, 24
; SI-NEXT:    v_alignbit_b32 v15, s8, s8, 8
; SI-NEXT:    v_alignbit_b32 v16, s8, s8, 24
; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v4, 0x10203
; VI-NEXT:    s_mov_b32 s15, 0xf000
; VI-NEXT:    s_mov_b32 s14, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; VI-NEXT:    s_mov_b32 s12, s8
; VI-NEXT:    s_mov_b32 s13, s9
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s3, v4
; VI-NEXT:    v_perm_b32 v2, 0, s2, v4
; VI-NEXT:    v_perm_b32 v1, 0, s1, v4
; VI-NEXT:    v_perm_b32 v0, 0, s0, v4
; VI-NEXT:    v_perm_b32 v7, 0, s7, v4
; VI-NEXT:    v_perm_b32 v6, 0, s6, v4
; VI-NEXT:    v_perm_b32 v5, 0, s5, v4
; VI-NEXT:    v_perm_b32 v4, 0, s4, v4
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_v8i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v7, 0, s7, 0x10203
; GFX11-NEXT:    v_perm_b32 v6, 0, s6, 0x10203
; GFX11-NEXT:    v_perm_b32 v5, 0, s5, 0x10203
; GFX11-NEXT:    v_perm_b32 v4, 0, s4, 0x10203
; GFX11-NEXT:    v_perm_b32 v3, 0, s3, 0x10203
; GFX11-NEXT:    v_perm_b32 v2, 0, s2, 0x10203
; GFX11-NEXT:    v_perm_b32 v1, 0, s1, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, s0, 0x10203
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
  %val = load <8 x i32>, ptr addrspace(1) %in, align 32
  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
  store <8 x i32> %bswap, ptr addrspace(1) %out, align 32
  ret void
}

; Kernel: load an i64 from %in, byte-swap it, store the result to %out.
define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v2, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v3, s5, s5, 24
; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v1, 0, s2, v0
; VI-NEXT:    v_perm_b32 v0, 0, s3, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v1, 0, s4, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, s5, 0x10203
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %val = load i64, ptr addrspace(1) %in, align 8
  %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
  store i64 %bswap, ptr addrspace(1) %out, align 8
  ret void
}

; Kernel: load a <2 x i64> from %in, byte-swap each element, store to %out.
define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s8, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v1, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v2, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v4, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v5, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v6, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v7, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v8, s5, s5, 24
; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s10, v0
; VI-NEXT:    v_perm_b32 v2, 0, s11, v0
; VI-NEXT:    v_perm_b32 v1, 0, s8, v0
; VI-NEXT:    v_perm_b32 v0, 0, s9, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_v2i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v3, 0, s6, 0x10203
; GFX11-NEXT:    v_perm_b32 v2, 0, s7, 0x10203
; GFX11-NEXT:    v_perm_b32 v1, 0, s4, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, s5, 0x10203
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
  %val = load <2 x i64>, ptr addrspace(1) %in, align 16
  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
  store <2 x i64> %bswap, ptr addrspace(1) %out, align 16
  ret void
}

; Kernel: load a <4 x i64> from %in, byte-swap each element, store to %out.
define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_bswap_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s12, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v1, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v2, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v4, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v5, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v6, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v7, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v8, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v9, s10, s10, 8
; SI-NEXT:    v_alignbit_b32 v10, s10, s10, 24
; SI-NEXT:    v_alignbit_b32 v11, s11, s11, 8
; SI-NEXT:    v_alignbit_b32 v12, s11, s11, 24
; SI-NEXT:    v_alignbit_b32 v13, s8, s8, 8
; SI-NEXT:    v_alignbit_b32 v14, s8, s8, 24
; SI-NEXT:    v_alignbit_b32 v15, s9, s9, 8
; SI-NEXT:    v_alignbit_b32 v16, s9, s9, 24
; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v4, 0x10203
; VI-NEXT:    s_mov_b32 s15, 0xf000
; VI-NEXT:    s_mov_b32 s14, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; VI-NEXT:    s_mov_b32 s12, s8
; VI-NEXT:    s_mov_b32 s13, s9
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s2, v4
; VI-NEXT:    v_perm_b32 v2, 0, s3, v4
; VI-NEXT:    v_perm_b32 v1, 0, s0, v4
; VI-NEXT:    v_perm_b32 v0, 0, s1, v4
; VI-NEXT:    v_perm_b32 v7, 0, s6, v4
; VI-NEXT:    v_perm_b32 v6, 0, s7, v4
; VI-NEXT:    v_perm_b32 v5, 0, s4, v4
; VI-NEXT:    v_perm_b32 v4, 0, s5, v4
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bswap_v4i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v7, 0, s6, 0x10203
; GFX11-NEXT:    v_perm_b32 v6, 0, s7, 0x10203
; GFX11-NEXT:    v_perm_b32 v5, 0, s4, 0x10203
; GFX11-NEXT:    v_perm_b32 v4, 0, s5, 0x10203
; GFX11-NEXT:    v_perm_b32 v3, 0, s2, 0x10203
; GFX11-NEXT:    v_perm_b32 v2, 0, s3, 0x10203
; GFX11-NEXT:    v_perm_b32 v1, 0, s0, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, s1, 0x10203
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
  %val = load <4 x i64>, ptr addrspace(1) %in, align 32
  %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
  store <4 x i64> %bswap, ptr addrspace(1) %out, align 32
  ret void
}

; Truncate an i32 argument to i16, byte-swap it, reinterpret the bits as half
; and extend to float.
define float @missing_truncate_promote_bswap(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bswap:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: missing_truncate_promote_bswap:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-REAL16-LABEL: missing_truncate_promote_bswap:
; GFX11-REAL16:       ; %bb.0: ; %bb
; GFX11-REAL16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-REAL16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
; GFX11-REAL16-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: missing_truncate_promote_bswap:
; GFX11-FAKE16:       ; %bb.0: ; %bb
; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

; Byte-swap an i16 argument and return it as i16.
define i16 @v_bswap_i16(i16 %src) {
; SI-LABEL: v_bswap_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  ret i16 %bswap
}

; Byte-swap an i16 argument and zero-extend the result to i32.
define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
; SI-LABEL: v_bswap_i16_zext_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16_zext_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_i16_zext_to_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %zext = zext i16 %bswap to i32
  ret i32 %zext
}

; Byte-swap an i16 argument and sign-extend the result to i32.
define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
; SI-LABEL: v_bswap_i16_sext_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16_sext_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_i16_sext_to_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %sext = sext i16 %bswap to i32
  ret i32 %sext
}

; Byte-swap each element of a <2 x i16> argument.
define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; SI-LABEL: v_bswap_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v2
; SI-NEXT:    v_bfi_b32 v1, s4, v1, v3
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x2030001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
  ret <2 x i16> %bswap
}

; Byte-swap each element of a <3 x i16> argument.
define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; SI-LABEL: v_bswap_v3i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v3, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v4, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_alignbit_b32 v5, v2, v2, 8
; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v3
; SI-NEXT:    v_bfi_b32 v1, s4, v1, v4
; SI-NEXT:    v_bfi_b32 v2, s4, v2, v5
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x2030001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_v3i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
; GFX11-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
  ret <3 x i16> %bswap
}

; Byte-swap each element of a <4 x i16> argument.
define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-LABEL: v_bswap_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v5, v3, v3, 8
; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT:    v_alignbit_b32 v6, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    v_alignbit_b32 v7, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_bfi_b32 v2, s4, v2, v4
; SI-NEXT:    v_bfi_b32 v3, s4, v3, v5
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v6
; SI-NEXT:    v_bfi_b32 v1, s4, v1, v7
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x2030001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
; GFX11-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src)
  ret <4 x i16> %bswap
}

; Truncate an i64 argument to i48, byte-swap it, and zero-extend back to i64.
define i64 @v_bswap_i48(i64 %src) {
; SI-LABEL: v_bswap_i48:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_bfi_b32 v2, s4, v0, v2
; SI-NEXT:    v_bfi_b32 v0, s4, v1, v3
; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i48:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x10203
; VI-NEXT:    v_perm_b32 v2, 0, v0, s4
; VI-NEXT:    v_perm_b32 v0, 0, v1, s4
; VI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_bswap_i48:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
; GFX11-NEXT:    v_perm_b32 v0, 0, v1, 0x10203
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i64 %src to i48
  %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
  %zext = zext i48 %bswap to i64
  ret i64 %zext
}