; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s

; Each test below defines one or two <4 x bfloat> values through inline asm,
; narrows them to <3 x bfloat> (%extract3 / %extract31), shuffles those into a
; <4 x bfloat> and stores the result to %ptr. The suffix of every function
; name spells the shuffle mask, one entry per result lane: "u" is a poison
; lane, 0-2 select from the first shuffle operand and 3-5 from the second.

; All four result lanes are poison; the expected output contains no store.
define void @v_shuffle_v4bf16_v3bf16__u_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__u_u_u_u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> poison
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract3[0]; lanes 1-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__0_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract3[1]; lanes 1-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract3[2]; lanes 1-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, v1
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_mov_b32_e32 v0, v1
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_mov_b32_e32 v0, v1
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Index 3 selects element 0 of the second shuffle operand, which is poison
; here; the expected output contains no store.
define void @v_shuffle_v4bf16_v3bf16__3_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__3_u_u_u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[1]; lanes 1-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__4_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[2]; lanes 1-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, v1
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_mov_b32_e32 v0, v1
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_mov_b32_e32 v0, v1
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[2], lane 1 = %extract3[0]; lanes 2-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_0_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:2]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[2:3]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[2], lane 1 = %extract3[1]; lanes 2-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_1_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:2]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[2:3]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[2], lane 1 = %extract3[2]; lanes 2-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_2_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[2:3]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[2], lane 1 = %extract31[0]; lanes 2-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_3_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lane 0 = %extract31[2], lane 1 = %extract31[1]; lanes 2-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_4_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0xffff
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v0
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-1 = %extract31[2]; lanes 2-3 are poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_u_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-1 = %extract31[2], lane 2 = %extract3[0]; lane 3 is poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:2]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
; GFX900-NEXT:    v_mov_b32_e32 v2, v0
; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v2, v3, v3, s4
; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[2:3]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_perm_b32 v2, v3, v3, s2
; GFX940-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-1 = %extract31[2], lane 2 = %extract3[1]; lane 3 is poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_1_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:2]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v1, s4, v0, 16
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v1, s4, v0, 16
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[2:3]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_alignbit_b32 v1, s0, v0, 16
; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-1 = %extract31[2], lane 2 = %extract3[2]; lane 3 is poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[2:3]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-1 = %extract31[2], lane 2 = %extract31[0]; lane 3 is poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT:    v_mov_b32_e32 v2, v0
; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
; GFX940-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-1 = %extract31[2], lane 2 = %extract31[1]; lane 3 is poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_4_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v2, s4, v0, 16
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v3, s4, v0, 16
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v4, 0
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_nop 0
; GFX940-NEXT:    v_alignbit_b32 v3, s0, v0, 16
; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Lanes 0-2 = %extract31[2]; lane 3 is poison.
define void @v_shuffle_v4bf16_v3bf16__5_5_5_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v2, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:1]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; def v[0:1]
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
; GFX940-NEXT:    v_mov_b32_e32 v2, 0
; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

define void @v_shuffle_v4bf16_v3bf16__5_5_5_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:1]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:
;;#ASMSTART 903; GFX900-NEXT: ; def v[1:2] 904; GFX900-NEXT: ;;#ASMEND 905; GFX900-NEXT: s_mov_b32 s4, 0x5040100 906; GFX900-NEXT: v_mov_b32_e32 v3, 0 907; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 908; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 909; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 910; GFX900-NEXT: s_waitcnt vmcnt(0) 911; GFX900-NEXT: s_setpc_b64 s[30:31] 912; 913; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0: 914; GFX90A: ; %bb.0: 915; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 916; GFX90A-NEXT: ;;#ASMSTART 917; GFX90A-NEXT: ; def v[0:1] 918; GFX90A-NEXT: ;;#ASMEND 919; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 920; GFX90A-NEXT: v_mov_b32_e32 v4, 0 921; GFX90A-NEXT: ;;#ASMSTART 922; GFX90A-NEXT: ; def v[2:3] 923; GFX90A-NEXT: ;;#ASMEND 924; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 925; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 926; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 927; GFX90A-NEXT: s_waitcnt vmcnt(0) 928; GFX90A-NEXT: s_setpc_b64 s[30:31] 929; 930; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0: 931; GFX940: ; %bb.0: 932; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 933; GFX940-NEXT: ;;#ASMSTART 934; GFX940-NEXT: ; def v[0:1] 935; GFX940-NEXT: ;;#ASMEND 936; GFX940-NEXT: s_mov_b32 s2, 0x5040100 937; GFX940-NEXT: v_mov_b32_e32 v4, 0 938; GFX940-NEXT: ;;#ASMSTART 939; GFX940-NEXT: ; def v[2:3] 940; GFX940-NEXT: ;;#ASMEND 941; GFX940-NEXT: s_nop 0 942; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 943; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 944; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 945; GFX940-NEXT: s_waitcnt vmcnt(0) 946; GFX940-NEXT: s_setpc_b64 s[30:31] 947 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 948 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 949 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 950 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 951 %shuf = shufflevector <3 x bfloat> 
%extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 0> 952 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 953 ret void 954} 955 956define void @v_shuffle_v4bf16_v3bf16__5_5_5_1(ptr addrspace(1) inreg %ptr) { 957; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1: 958; GFX900: ; %bb.0: 959; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 960; GFX900-NEXT: ;;#ASMSTART 961; GFX900-NEXT: ; def v[0:1] 962; GFX900-NEXT: ;;#ASMEND 963; GFX900-NEXT: ;;#ASMSTART 964; GFX900-NEXT: ; def v[1:2] 965; GFX900-NEXT: ;;#ASMEND 966; GFX900-NEXT: s_mov_b32 s4, 0xffff 967; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 968; GFX900-NEXT: s_mov_b32 s4, 0x5040100 969; GFX900-NEXT: v_mov_b32_e32 v3, 0 970; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 971; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 972; GFX900-NEXT: s_waitcnt vmcnt(0) 973; GFX900-NEXT: s_setpc_b64 s[30:31] 974; 975; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1: 976; GFX90A: ; %bb.0: 977; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 978; GFX90A-NEXT: ;;#ASMSTART 979; GFX90A-NEXT: ; def v[0:1] 980; GFX90A-NEXT: ;;#ASMEND 981; GFX90A-NEXT: s_mov_b32 s4, 0xffff 982; GFX90A-NEXT: ;;#ASMSTART 983; GFX90A-NEXT: ; def v[2:3] 984; GFX90A-NEXT: ;;#ASMEND 985; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 986; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 987; GFX90A-NEXT: v_mov_b32_e32 v4, 0 988; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 989; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 990; GFX90A-NEXT: s_waitcnt vmcnt(0) 991; GFX90A-NEXT: s_setpc_b64 s[30:31] 992; 993; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1: 994; GFX940: ; %bb.0: 995; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 996; GFX940-NEXT: ;;#ASMSTART 997; GFX940-NEXT: ; def v[0:1] 998; GFX940-NEXT: ;;#ASMEND 999; GFX940-NEXT: s_mov_b32 s2, 0xffff 1000; GFX940-NEXT: ;;#ASMSTART 1001; GFX940-NEXT: ; def v[2:3] 1002; GFX940-NEXT: ;;#ASMEND 1003; GFX940-NEXT: v_mov_b32_e32 v4, 0 1004; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 1005; 
GFX940-NEXT: s_mov_b32 s2, 0x5040100 1006; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 1007; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 1008; GFX940-NEXT: s_waitcnt vmcnt(0) 1009; GFX940-NEXT: s_setpc_b64 s[30:31] 1010 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1011 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1012 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1013 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1014 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 1> 1015 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1016 ret void 1017} 1018 1019define void @v_shuffle_v4bf16_v3bf16__5_5_5_2(ptr addrspace(1) inreg %ptr) { 1020; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2: 1021; GFX900: ; %bb.0: 1022; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1023; GFX900-NEXT: ;;#ASMSTART 1024; GFX900-NEXT: ; def v[0:1] 1025; GFX900-NEXT: ;;#ASMEND 1026; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1027; GFX900-NEXT: v_mov_b32_e32 v4, 0 1028; GFX900-NEXT: ;;#ASMSTART 1029; GFX900-NEXT: ; def v[2:3] 1030; GFX900-NEXT: ;;#ASMEND 1031; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 1032; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 1033; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 1034; GFX900-NEXT: s_waitcnt vmcnt(0) 1035; GFX900-NEXT: s_setpc_b64 s[30:31] 1036; 1037; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2: 1038; GFX90A: ; %bb.0: 1039; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1040; GFX90A-NEXT: ;;#ASMSTART 1041; GFX90A-NEXT: ; def v[0:1] 1042; GFX90A-NEXT: ;;#ASMEND 1043; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1044; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1045; GFX90A-NEXT: ;;#ASMSTART 1046; GFX90A-NEXT: ; def v[2:3] 1047; GFX90A-NEXT: ;;#ASMEND 1048; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 1049; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 1050; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] 1051; GFX90A-NEXT: s_waitcnt vmcnt(0) 1052; GFX90A-NEXT: s_setpc_b64 s[30:31] 1053; 1054; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2: 1055; GFX940: ; %bb.0: 1056; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1057; GFX940-NEXT: ;;#ASMSTART 1058; GFX940-NEXT: ; def v[0:1] 1059; GFX940-NEXT: ;;#ASMEND 1060; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1061; GFX940-NEXT: v_mov_b32_e32 v4, 0 1062; GFX940-NEXT: ;;#ASMSTART 1063; GFX940-NEXT: ; def v[2:3] 1064; GFX940-NEXT: ;;#ASMEND 1065; GFX940-NEXT: s_nop 0 1066; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 1067; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 1068; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 1069; GFX940-NEXT: s_waitcnt vmcnt(0) 1070; GFX940-NEXT: s_setpc_b64 s[30:31] 1071 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1072 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1073 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1074 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1075 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 2> 1076 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1077 ret void 1078} 1079 1080define void @v_shuffle_v4bf16_v3bf16__5_5_5_3(ptr addrspace(1) inreg %ptr) { 1081; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3: 1082; GFX900: ; %bb.0: 1083; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1084; GFX900-NEXT: ;;#ASMSTART 1085; GFX900-NEXT: ; def v[0:1] 1086; GFX900-NEXT: ;;#ASMEND 1087; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1088; GFX900-NEXT: v_mov_b32_e32 v3, 0 1089; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 1090; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 1091; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1092; GFX900-NEXT: s_waitcnt vmcnt(0) 1093; GFX900-NEXT: s_setpc_b64 s[30:31] 1094; 1095; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3: 1096; GFX90A: ; %bb.0: 1097; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1098; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1099; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1100; GFX90A-NEXT: ;;#ASMSTART 1101; GFX90A-NEXT: ; def v[0:1] 1102; GFX90A-NEXT: ;;#ASMEND 1103; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 1104; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 1105; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1106; GFX90A-NEXT: s_waitcnt vmcnt(0) 1107; GFX90A-NEXT: s_setpc_b64 s[30:31] 1108; 1109; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3: 1110; GFX940: ; %bb.0: 1111; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1112; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1113; GFX940-NEXT: v_mov_b32_e32 v4, 0 1114; GFX940-NEXT: ;;#ASMSTART 1115; GFX940-NEXT: ; def v[0:1] 1116; GFX940-NEXT: ;;#ASMEND 1117; GFX940-NEXT: s_nop 0 1118; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 1119; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 1120; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1121; GFX940-NEXT: s_waitcnt vmcnt(0) 1122; GFX940-NEXT: s_setpc_b64 s[30:31] 1123 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1124 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1125 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1126 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1127 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 3> 1128 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1129 ret void 1130} 1131 1132define void @v_shuffle_v4bf16_v3bf16__5_5_5_4(ptr addrspace(1) inreg %ptr) { 1133; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4: 1134; GFX900: ; %bb.0: 1135; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1136; GFX900-NEXT: s_mov_b32 s4, 0xffff 1137; GFX900-NEXT: ;;#ASMSTART 1138; GFX900-NEXT: ; def v[0:1] 1139; GFX900-NEXT: ;;#ASMEND 1140; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 1141; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1142; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 1143; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 1144; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1145; GFX900-NEXT: s_waitcnt vmcnt(0) 1146; GFX900-NEXT: s_setpc_b64 s[30:31] 1147; 1148; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4: 1149; GFX90A: ; %bb.0: 1150; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1151; GFX90A-NEXT: s_mov_b32 s4, 0xffff 1152; GFX90A-NEXT: ;;#ASMSTART 1153; GFX90A-NEXT: ; def v[0:1] 1154; GFX90A-NEXT: ;;#ASMEND 1155; GFX90A-NEXT: v_bfi_b32 v3, s4, v1, v0 1156; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1157; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1158; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 1159; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1160; GFX90A-NEXT: s_waitcnt vmcnt(0) 1161; GFX90A-NEXT: s_setpc_b64 s[30:31] 1162; 1163; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4: 1164; GFX940: ; %bb.0: 1165; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1166; GFX940-NEXT: s_mov_b32 s2, 0xffff 1167; GFX940-NEXT: ;;#ASMSTART 1168; GFX940-NEXT: ; def v[0:1] 1169; GFX940-NEXT: ;;#ASMEND 1170; GFX940-NEXT: v_mov_b32_e32 v4, 0 1171; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 1172; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1173; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 1174; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1175; GFX940-NEXT: s_waitcnt vmcnt(0) 1176; GFX940-NEXT: s_setpc_b64 s[30:31] 1177 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1178 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1179 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1180 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1181 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 4> 1182 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1183 ret void 1184} 1185 1186define void @v_shuffle_v4bf16_v3bf16__5_5_5_5(ptr addrspace(1) inreg %ptr) { 1187; 
GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5: 1188; GFX900: ; %bb.0: 1189; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1190; GFX900-NEXT: ;;#ASMSTART 1191; GFX900-NEXT: ; def v[0:1] 1192; GFX900-NEXT: ;;#ASMEND 1193; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1194; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 1195; GFX900-NEXT: v_mov_b32_e32 v2, 0 1196; GFX900-NEXT: v_mov_b32_e32 v1, v0 1197; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1198; GFX900-NEXT: s_waitcnt vmcnt(0) 1199; GFX900-NEXT: s_setpc_b64 s[30:31] 1200; 1201; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5: 1202; GFX90A: ; %bb.0: 1203; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1204; GFX90A-NEXT: ;;#ASMSTART 1205; GFX90A-NEXT: ; def v[0:1] 1206; GFX90A-NEXT: ;;#ASMEND 1207; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1208; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 1209; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1210; GFX90A-NEXT: v_mov_b32_e32 v1, v0 1211; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1212; GFX90A-NEXT: s_waitcnt vmcnt(0) 1213; GFX90A-NEXT: s_setpc_b64 s[30:31] 1214; 1215; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5: 1216; GFX940: ; %bb.0: 1217; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1218; GFX940-NEXT: ;;#ASMSTART 1219; GFX940-NEXT: ; def v[0:1] 1220; GFX940-NEXT: ;;#ASMEND 1221; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1222; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 1223; GFX940-NEXT: v_mov_b32_e32 v2, 0 1224; GFX940-NEXT: v_mov_b32_e32 v1, v0 1225; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 1226; GFX940-NEXT: s_waitcnt vmcnt(0) 1227; GFX940-NEXT: s_setpc_b64 s[30:31] 1228 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1229 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1230 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1231 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1232 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> 
%extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 5> 1233 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1234 ret void 1235} 1236 1237define void @v_shuffle_v4bf16_v3bf16__u_0_0_0(ptr addrspace(1) inreg %ptr) { 1238; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0: 1239; GFX900: ; %bb.0: 1240; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1241; GFX900-NEXT: ;;#ASMSTART 1242; GFX900-NEXT: ; def v[0:1] 1243; GFX900-NEXT: ;;#ASMEND 1244; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1245; GFX900-NEXT: v_mov_b32_e32 v2, 0 1246; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 1247; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1248; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1249; GFX900-NEXT: s_waitcnt vmcnt(0) 1250; GFX900-NEXT: s_setpc_b64 s[30:31] 1251; 1252; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0: 1253; GFX90A: ; %bb.0: 1254; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1255; GFX90A-NEXT: ;;#ASMSTART 1256; GFX90A-NEXT: ; def v[0:1] 1257; GFX90A-NEXT: ;;#ASMEND 1258; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1259; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1260; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 1261; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1262; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1263; GFX90A-NEXT: s_waitcnt vmcnt(0) 1264; GFX90A-NEXT: s_setpc_b64 s[30:31] 1265; 1266; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0: 1267; GFX940: ; %bb.0: 1268; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1269; GFX940-NEXT: ;;#ASMSTART 1270; GFX940-NEXT: ; def v[0:1] 1271; GFX940-NEXT: ;;#ASMEND 1272; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1273; GFX940-NEXT: v_mov_b32_e32 v2, 0 1274; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 1275; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1276; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 1277; GFX940-NEXT: s_waitcnt vmcnt(0) 1278; GFX940-NEXT: s_setpc_b64 s[30:31] 1279 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1280 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> 
<i32 0, i32 1, i32 2> 1281 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0> 1282 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1283 ret void 1284} 1285 1286define void @v_shuffle_v4bf16_v3bf16__0_0_0_0(ptr addrspace(1) inreg %ptr) { 1287; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0: 1288; GFX900: ; %bb.0: 1289; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1290; GFX900-NEXT: ;;#ASMSTART 1291; GFX900-NEXT: ; def v[0:1] 1292; GFX900-NEXT: ;;#ASMEND 1293; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1294; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 1295; GFX900-NEXT: v_mov_b32_e32 v2, 0 1296; GFX900-NEXT: v_mov_b32_e32 v1, v0 1297; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1298; GFX900-NEXT: s_waitcnt vmcnt(0) 1299; GFX900-NEXT: s_setpc_b64 s[30:31] 1300; 1301; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0: 1302; GFX90A: ; %bb.0: 1303; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1304; GFX90A-NEXT: ;;#ASMSTART 1305; GFX90A-NEXT: ; def v[0:1] 1306; GFX90A-NEXT: ;;#ASMEND 1307; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1308; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 1309; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1310; GFX90A-NEXT: v_mov_b32_e32 v1, v0 1311; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1312; GFX90A-NEXT: s_waitcnt vmcnt(0) 1313; GFX90A-NEXT: s_setpc_b64 s[30:31] 1314; 1315; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0: 1316; GFX940: ; %bb.0: 1317; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1318; GFX940-NEXT: ;;#ASMSTART 1319; GFX940-NEXT: ; def v[0:1] 1320; GFX940-NEXT: ;;#ASMEND 1321; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1322; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 1323; GFX940-NEXT: v_mov_b32_e32 v2, 0 1324; GFX940-NEXT: v_mov_b32_e32 v1, v0 1325; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 1326; GFX940-NEXT: s_waitcnt vmcnt(0) 1327; GFX940-NEXT: s_setpc_b64 s[30:31] 1328 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1329 %extract3 = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1330 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> zeroinitializer 1331 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1332 ret void 1333} 1334 1335define void @v_shuffle_v4bf16_v3bf16__1_0_0_0(ptr addrspace(1) inreg %ptr) { 1336; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0: 1337; GFX900: ; %bb.0: 1338; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1339; GFX900-NEXT: ;;#ASMSTART 1340; GFX900-NEXT: ; def v[0:1] 1341; GFX900-NEXT: ;;#ASMEND 1342; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1343; GFX900-NEXT: v_mov_b32_e32 v2, 0 1344; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 1345; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 1346; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1347; GFX900-NEXT: s_waitcnt vmcnt(0) 1348; GFX900-NEXT: s_setpc_b64 s[30:31] 1349; 1350; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0: 1351; GFX90A: ; %bb.0: 1352; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1353; GFX90A-NEXT: ;;#ASMSTART 1354; GFX90A-NEXT: ; def v[0:1] 1355; GFX90A-NEXT: ;;#ASMEND 1356; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1357; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1358; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 1359; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 1360; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1361; GFX90A-NEXT: s_waitcnt vmcnt(0) 1362; GFX90A-NEXT: s_setpc_b64 s[30:31] 1363; 1364; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0: 1365; GFX940: ; %bb.0: 1366; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1367; GFX940-NEXT: ;;#ASMSTART 1368; GFX940-NEXT: ; def v[0:1] 1369; GFX940-NEXT: ;;#ASMEND 1370; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1371; GFX940-NEXT: v_mov_b32_e32 v2, 0 1372; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 1373; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 1374; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 1375; GFX940-NEXT: s_waitcnt vmcnt(0) 1376; GFX940-NEXT: s_setpc_b64 s[30:31] 
1377 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1378 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1379 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 1380 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1381 ret void 1382} 1383 1384define void @v_shuffle_v4bf16_v3bf16__2_0_0_0(ptr addrspace(1) inreg %ptr) { 1385; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0: 1386; GFX900: ; %bb.0: 1387; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1388; GFX900-NEXT: ;;#ASMSTART 1389; GFX900-NEXT: ; def v[0:1] 1390; GFX900-NEXT: ;;#ASMEND 1391; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1392; GFX900-NEXT: v_mov_b32_e32 v3, 0 1393; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 1394; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1395; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1396; GFX900-NEXT: s_waitcnt vmcnt(0) 1397; GFX900-NEXT: s_setpc_b64 s[30:31] 1398; 1399; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0: 1400; GFX90A: ; %bb.0: 1401; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1402; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1403; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1404; GFX90A-NEXT: ;;#ASMSTART 1405; GFX90A-NEXT: ; def v[0:1] 1406; GFX90A-NEXT: ;;#ASMEND 1407; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 1408; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 1409; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1410; GFX90A-NEXT: s_waitcnt vmcnt(0) 1411; GFX90A-NEXT: s_setpc_b64 s[30:31] 1412; 1413; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0: 1414; GFX940: ; %bb.0: 1415; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1416; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1417; GFX940-NEXT: v_mov_b32_e32 v4, 0 1418; GFX940-NEXT: ;;#ASMSTART 1419; GFX940-NEXT: ; def v[0:1] 1420; GFX940-NEXT: ;;#ASMEND 1421; GFX940-NEXT: s_nop 0 1422; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 1423; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 1424; GFX940-NEXT: global_store_dwordx2 v4, 
v[2:3], s[0:1] sc0 sc1 1425; GFX940-NEXT: s_waitcnt vmcnt(0) 1426; GFX940-NEXT: s_setpc_b64 s[30:31] 1427 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1428 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1429 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 1430 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1431 ret void 1432} 1433 1434define void @v_shuffle_v4bf16_v3bf16__3_0_0_0(ptr addrspace(1) inreg %ptr) { 1435; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0: 1436; GFX900: ; %bb.0: 1437; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1438; GFX900-NEXT: ;;#ASMSTART 1439; GFX900-NEXT: ; def v[0:1] 1440; GFX900-NEXT: ;;#ASMEND 1441; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1442; GFX900-NEXT: v_mov_b32_e32 v2, 0 1443; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 1444; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1445; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1446; GFX900-NEXT: s_waitcnt vmcnt(0) 1447; GFX900-NEXT: s_setpc_b64 s[30:31] 1448; 1449; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0: 1450; GFX90A: ; %bb.0: 1451; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1452; GFX90A-NEXT: ;;#ASMSTART 1453; GFX90A-NEXT: ; def v[0:1] 1454; GFX90A-NEXT: ;;#ASMEND 1455; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1456; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1457; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 1458; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1459; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 1460; GFX90A-NEXT: s_waitcnt vmcnt(0) 1461; GFX90A-NEXT: s_setpc_b64 s[30:31] 1462; 1463; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0: 1464; GFX940: ; %bb.0: 1465; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1466; GFX940-NEXT: ;;#ASMSTART 1467; GFX940-NEXT: ; def v[0:1] 1468; GFX940-NEXT: ;;#ASMEND 1469; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1470; GFX940-NEXT: v_mov_b32_e32 v2, 0 1471; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 1472; 
GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1473; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 1474; GFX940-NEXT: s_waitcnt vmcnt(0) 1475; GFX940-NEXT: s_setpc_b64 s[30:31] 1476 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1477 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1478 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0> 1479 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1480 ret void 1481} 1482 1483define void @v_shuffle_v4bf16_v3bf16__4_0_0_0(ptr addrspace(1) inreg %ptr) { 1484; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0: 1485; GFX900: ; %bb.0: 1486; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1487; GFX900-NEXT: ;;#ASMSTART 1488; GFX900-NEXT: ; def v[0:1] 1489; GFX900-NEXT: ;;#ASMEND 1490; GFX900-NEXT: ;;#ASMSTART 1491; GFX900-NEXT: ; def v[1:2] 1492; GFX900-NEXT: ;;#ASMEND 1493; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1494; GFX900-NEXT: v_mov_b32_e32 v3, 0 1495; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1496; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 1497; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1498; GFX900-NEXT: s_waitcnt vmcnt(0) 1499; GFX900-NEXT: s_setpc_b64 s[30:31] 1500; 1501; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0: 1502; GFX90A: ; %bb.0: 1503; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1504; GFX90A-NEXT: ;;#ASMSTART 1505; GFX90A-NEXT: ; def v[0:1] 1506; GFX90A-NEXT: ;;#ASMEND 1507; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1508; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1509; GFX90A-NEXT: ;;#ASMSTART 1510; GFX90A-NEXT: ; def v[2:3] 1511; GFX90A-NEXT: ;;#ASMEND 1512; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 1513; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 1514; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 1515; GFX90A-NEXT: s_waitcnt vmcnt(0) 1516; GFX90A-NEXT: s_setpc_b64 s[30:31] 1517; 1518; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0: 1519; GFX940: ; %bb.0: 1520; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1521; GFX940-NEXT: ;;#ASMSTART 1522; GFX940-NEXT: ; def v[0:1] 1523; GFX940-NEXT: ;;#ASMEND 1524; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1525; GFX940-NEXT: v_mov_b32_e32 v4, 0 1526; GFX940-NEXT: ;;#ASMSTART 1527; GFX940-NEXT: ; def v[2:3] 1528; GFX940-NEXT: ;;#ASMEND 1529; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 1530; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 1531; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 1532; GFX940-NEXT: s_waitcnt vmcnt(0) 1533; GFX940-NEXT: s_setpc_b64 s[30:31] 1534 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1535 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1536 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1537 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1538 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 0, i32 0, i32 0> 1539 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1540 ret void 1541} 1542; Two-source shuffle, mask <5,0,0,0>: lane 0 = %extract31[2], lanes 1-3 = %extract3[0]. 1543define void @v_shuffle_v4bf16_v3bf16__5_0_0_0(ptr addrspace(1) inreg %ptr) { 1544; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0: 1545; GFX900: ; %bb.0: 1546; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1547; GFX900-NEXT: ;;#ASMSTART 1548; GFX900-NEXT: ; def v[0:1] 1549; GFX900-NEXT: ;;#ASMEND 1550; GFX900-NEXT: ;;#ASMSTART 1551; GFX900-NEXT: ; def v[1:2] 1552; GFX900-NEXT: ;;#ASMEND 1553; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1554; GFX900-NEXT: v_mov_b32_e32 v3, 0 1555; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 1556; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1557; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1558; GFX900-NEXT: s_waitcnt vmcnt(0) 1559; GFX900-NEXT: s_setpc_b64 s[30:31] 1560; 1561; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0: 1562; GFX90A: ; %bb.0: 1563; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1564; GFX90A-NEXT: ;;#ASMSTART 1565; GFX90A-NEXT: ; def v[2:3] 1566; 
GFX90A-NEXT: ;;#ASMEND 1567; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1568; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1569; GFX90A-NEXT: ;;#ASMSTART 1570; GFX90A-NEXT: ; def v[0:1] 1571; GFX90A-NEXT: ;;#ASMEND 1572; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 1573; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 1574; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1575; GFX90A-NEXT: s_waitcnt vmcnt(0) 1576; GFX90A-NEXT: s_setpc_b64 s[30:31] 1577; 1578; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0: 1579; GFX940: ; %bb.0: 1580; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1581; GFX940-NEXT: ;;#ASMSTART 1582; GFX940-NEXT: ; def v[2:3] 1583; GFX940-NEXT: ;;#ASMEND 1584; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1585; GFX940-NEXT: v_mov_b32_e32 v4, 0 1586; GFX940-NEXT: ;;#ASMSTART 1587; GFX940-NEXT: ; def v[0:1] 1588; GFX940-NEXT: ;;#ASMEND 1589; GFX940-NEXT: s_nop 0 1590; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 1591; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 1592; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1593; GFX940-NEXT: s_waitcnt vmcnt(0) 1594; GFX940-NEXT: s_setpc_b64 s[30:31] 1595 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1596 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1597 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1598 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1599 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 0, i32 0> 1600 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1601 ret void 1602} 1603; Two-source shuffle, mask <5,poison,0,0>: lane 0 = %extract31[2], lane 1 = poison, lanes 2-3 = %extract3[0]. 1604define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) { 1605; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0: 1606; GFX900: ; %bb.0: 1607; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1608; GFX900-NEXT: ;;#ASMSTART 1609; GFX900-NEXT: ; def v[0:1] 1610; GFX900-NEXT: ;;#ASMEND 1611; GFX900-NEXT: ;;#ASMSTART 1612; GFX900-NEXT: ; def v[1:2] 1613; 
GFX900-NEXT: ;;#ASMEND 1614; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1615; GFX900-NEXT: v_mov_b32_e32 v3, 0 1616; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 1617; GFX900-NEXT: v_mov_b32_e32 v0, v2 1618; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 1619; GFX900-NEXT: s_waitcnt vmcnt(0) 1620; GFX900-NEXT: s_setpc_b64 s[30:31] 1621; 1622; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0: 1623; GFX90A: ; %bb.0: 1624; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1625; GFX90A-NEXT: ;;#ASMSTART 1626; GFX90A-NEXT: ; def v[0:1] 1627; GFX90A-NEXT: ;;#ASMEND 1628; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1629; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1630; GFX90A-NEXT: ;;#ASMSTART 1631; GFX90A-NEXT: ; def v[2:3] 1632; GFX90A-NEXT: ;;#ASMEND 1633; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 1634; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1635; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 1636; GFX90A-NEXT: s_waitcnt vmcnt(0) 1637; GFX90A-NEXT: s_setpc_b64 s[30:31] 1638; 1639; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0: 1640; GFX940: ; %bb.0: 1641; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1642; GFX940-NEXT: ;;#ASMSTART 1643; GFX940-NEXT: ; def v[0:1] 1644; GFX940-NEXT: ;;#ASMEND 1645; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1646; GFX940-NEXT: v_mov_b32_e32 v4, 0 1647; GFX940-NEXT: ;;#ASMSTART 1648; GFX940-NEXT: ; def v[2:3] 1649; GFX940-NEXT: ;;#ASMEND 1650; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 1651; GFX940-NEXT: v_mov_b32_e32 v0, v3 1652; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 1653; GFX940-NEXT: s_waitcnt vmcnt(0) 1654; GFX940-NEXT: s_setpc_b64 s[30:31] 1655 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1656 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1657 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1658 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1659 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x 
i32> <i32 5, i32 poison, i32 0, i32 0> 1660 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1661 ret void 1662} 1663; Two-source shuffle, mask <5,1,0,0>: lane 0 = %extract31[2], lane 1 = %extract3[1], lanes 2-3 = %extract3[0]. 1664define void @v_shuffle_v4bf16_v3bf16__5_1_0_0(ptr addrspace(1) inreg %ptr) { 1665; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0: 1666; GFX900: ; %bb.0: 1667; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1668; GFX900-NEXT: ;;#ASMSTART 1669; GFX900-NEXT: ; def v[0:1] 1670; GFX900-NEXT: ;;#ASMEND 1671; GFX900-NEXT: ;;#ASMSTART 1672; GFX900-NEXT: ; def v[1:2] 1673; GFX900-NEXT: ;;#ASMEND 1674; GFX900-NEXT: s_mov_b32 s4, 0xffff 1675; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 1676; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1677; GFX900-NEXT: v_mov_b32_e32 v3, 0 1678; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1679; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1680; GFX900-NEXT: s_waitcnt vmcnt(0) 1681; GFX900-NEXT: s_setpc_b64 s[30:31] 1682; 1683; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0: 1684; GFX90A: ; %bb.0: 1685; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1686; GFX90A-NEXT: ;;#ASMSTART 1687; GFX90A-NEXT: ; def v[2:3] 1688; GFX90A-NEXT: ;;#ASMEND 1689; GFX90A-NEXT: s_mov_b32 s4, 0xffff 1690; GFX90A-NEXT: ;;#ASMSTART 1691; GFX90A-NEXT: ; def v[0:1] 1692; GFX90A-NEXT: ;;#ASMEND 1693; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 1694; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1695; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1696; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 1697; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1698; GFX90A-NEXT: s_waitcnt vmcnt(0) 1699; GFX90A-NEXT: s_setpc_b64 s[30:31] 1700; 1701; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0: 1702; GFX940: ; %bb.0: 1703; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1704; GFX940-NEXT: ;;#ASMSTART 1705; GFX940-NEXT: ; def v[2:3] 1706; GFX940-NEXT: ;;#ASMEND 1707; GFX940-NEXT: s_mov_b32 s2, 0xffff 1708; GFX940-NEXT: ;;#ASMSTART 1709; GFX940-NEXT: ; def v[0:1] 1710; GFX940-NEXT: ;;#ASMEND 1711; GFX940-NEXT: v_mov_b32_e32 v4, 0 1712; GFX940-NEXT: v_bfi_b32 v2, s2, v3, 
v0 1713; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1714; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 1715; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1716; GFX940-NEXT: s_waitcnt vmcnt(0) 1717; GFX940-NEXT: s_setpc_b64 s[30:31] 1718 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1719 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1720 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1721 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1722 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 0, i32 0> 1723 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1724 ret void 1725} 1726; Two-source shuffle, mask <5,2,0,0>: lane 0 = %extract31[2], lane 1 = %extract3[2], lanes 2-3 = %extract3[0]. 1727define void @v_shuffle_v4bf16_v3bf16__5_2_0_0(ptr addrspace(1) inreg %ptr) { 1728; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0: 1729; GFX900: ; %bb.0: 1730; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1731; GFX900-NEXT: ;;#ASMSTART 1732; GFX900-NEXT: ; def v[0:1] 1733; GFX900-NEXT: ;;#ASMEND 1734; GFX900-NEXT: ;;#ASMSTART 1735; GFX900-NEXT: ; def v[2:3] 1736; GFX900-NEXT: ;;#ASMEND 1737; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1738; GFX900-NEXT: v_mov_b32_e32 v4, 0 1739; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 1740; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1741; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] 1742; GFX900-NEXT: s_waitcnt vmcnt(0) 1743; GFX900-NEXT: s_setpc_b64 s[30:31] 1744; 1745; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0: 1746; GFX90A: ; %bb.0: 1747; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1748; GFX90A-NEXT: ;;#ASMSTART 1749; GFX90A-NEXT: ; def v[2:3] 1750; GFX90A-NEXT: ;;#ASMEND 1751; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1752; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1753; GFX90A-NEXT: ;;#ASMSTART 1754; GFX90A-NEXT: ; def v[0:1] 1755; GFX90A-NEXT: ;;#ASMEND 1756; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 1757; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 1758; GFX90A-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] 1759; GFX90A-NEXT: s_waitcnt vmcnt(0) 1760; GFX90A-NEXT: s_setpc_b64 s[30:31] 1761; 1762; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0: 1763; GFX940: ; %bb.0: 1764; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1765; GFX940-NEXT: ;;#ASMSTART 1766; GFX940-NEXT: ; def v[2:3] 1767; GFX940-NEXT: ;;#ASMEND 1768; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1769; GFX940-NEXT: v_mov_b32_e32 v4, 0 1770; GFX940-NEXT: ;;#ASMSTART 1771; GFX940-NEXT: ; def v[0:1] 1772; GFX940-NEXT: ;;#ASMEND 1773; GFX940-NEXT: s_nop 0 1774; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 1775; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 1776; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1777; GFX940-NEXT: s_waitcnt vmcnt(0) 1778; GFX940-NEXT: s_setpc_b64 s[30:31] 1779 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1780 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1781 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1782 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1783 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 0, i32 0> 1784 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1785 ret void 1786} 1787; Two-source shuffle, mask <5,3,0,0>: lane 0 = %extract31[2], lane 1 = %extract31[0], lanes 2-3 = %extract3[0]. 1788define void @v_shuffle_v4bf16_v3bf16__5_3_0_0(ptr addrspace(1) inreg %ptr) { 1789; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0: 1790; GFX900: ; %bb.0: 1791; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1792; GFX900-NEXT: ;;#ASMSTART 1793; GFX900-NEXT: ; def v[0:1] 1794; GFX900-NEXT: ;;#ASMEND 1795; GFX900-NEXT: ;;#ASMSTART 1796; GFX900-NEXT: ; def v[1:2] 1797; GFX900-NEXT: ;;#ASMEND 1798; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1799; GFX900-NEXT: v_mov_b32_e32 v3, 0 1800; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 1801; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1802; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1803; GFX900-NEXT: s_waitcnt vmcnt(0) 1804; GFX900-NEXT: 
s_setpc_b64 s[30:31] 1805; 1806; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0: 1807; GFX90A: ; %bb.0: 1808; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1809; GFX90A-NEXT: ;;#ASMSTART 1810; GFX90A-NEXT: ; def v[2:3] 1811; GFX90A-NEXT: ;;#ASMEND 1812; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1813; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1814; GFX90A-NEXT: ;;#ASMSTART 1815; GFX90A-NEXT: ; def v[0:1] 1816; GFX90A-NEXT: ;;#ASMEND 1817; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 1818; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 1819; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1820; GFX90A-NEXT: s_waitcnt vmcnt(0) 1821; GFX90A-NEXT: s_setpc_b64 s[30:31] 1822; 1823; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0: 1824; GFX940: ; %bb.0: 1825; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1826; GFX940-NEXT: ;;#ASMSTART 1827; GFX940-NEXT: ; def v[2:3] 1828; GFX940-NEXT: ;;#ASMEND 1829; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1830; GFX940-NEXT: v_mov_b32_e32 v4, 0 1831; GFX940-NEXT: ;;#ASMSTART 1832; GFX940-NEXT: ; def v[0:1] 1833; GFX940-NEXT: ;;#ASMEND 1834; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 1835; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 1836; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1837; GFX940-NEXT: s_waitcnt vmcnt(0) 1838; GFX940-NEXT: s_setpc_b64 s[30:31] 1839 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1840 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1841 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1842 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1843 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 0, i32 0> 1844 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1845 ret void 1846} 1847; Two-source shuffle, mask <5,4,0,0>: lane 0 = %extract31[2], lane 1 = %extract31[1], lanes 2-3 = %extract3[0]. 1848define void @v_shuffle_v4bf16_v3bf16__5_4_0_0(ptr addrspace(1) inreg %ptr) { 1849; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0: 1850; GFX900: ; %bb.0: 1851; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1852; GFX900-NEXT: ;;#ASMSTART 1853; GFX900-NEXT: ; def v[0:1] 1854; GFX900-NEXT: ;;#ASMEND 1855; GFX900-NEXT: ;;#ASMSTART 1856; GFX900-NEXT: ; def v[1:2] 1857; GFX900-NEXT: ;;#ASMEND 1858; GFX900-NEXT: s_mov_b32 s4, 0xffff 1859; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 1860; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1861; GFX900-NEXT: v_mov_b32_e32 v3, 0 1862; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 1863; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1864; GFX900-NEXT: s_waitcnt vmcnt(0) 1865; GFX900-NEXT: s_setpc_b64 s[30:31] 1866; 1867; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0: 1868; GFX90A: ; %bb.0: 1869; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1870; GFX90A-NEXT: ;;#ASMSTART 1871; GFX90A-NEXT: ; def v[2:3] 1872; GFX90A-NEXT: ;;#ASMEND 1873; GFX90A-NEXT: s_mov_b32 s4, 0xffff 1874; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 1875; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1876; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1877; GFX90A-NEXT: ;;#ASMSTART 1878; GFX90A-NEXT: ; def v[0:1] 1879; GFX90A-NEXT: ;;#ASMEND 1880; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 1881; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 1882; GFX90A-NEXT: s_waitcnt vmcnt(0) 1883; GFX90A-NEXT: s_setpc_b64 s[30:31] 1884; 1885; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0: 1886; GFX940: ; %bb.0: 1887; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1888; GFX940-NEXT: ;;#ASMSTART 1889; GFX940-NEXT: ; def v[2:3] 1890; GFX940-NEXT: ;;#ASMEND 1891; GFX940-NEXT: s_mov_b32 s2, 0xffff 1892; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 1893; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1894; GFX940-NEXT: v_mov_b32_e32 v4, 0 1895; GFX940-NEXT: ;;#ASMSTART 1896; GFX940-NEXT: ; def v[0:1] 1897; GFX940-NEXT: ;;#ASMEND 1898; GFX940-NEXT: s_nop 0 1899; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 1900; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 1901; GFX940-NEXT: s_waitcnt vmcnt(0) 1902; GFX940-NEXT: s_setpc_b64 s[30:31] 1903 %vec0 = call <4 x bfloat> asm "; 
def $0", "=v"() 1904 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1905 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1906 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1907 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 0, i32 0> 1908 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1909 ret void 1910} 1911; Two-source shuffle, mask <5,5,0,0>: lanes 0-1 = %extract31[2], lanes 2-3 = %extract3[0]. 1912define void @v_shuffle_v4bf16_v3bf16__5_5_0_0(ptr addrspace(1) inreg %ptr) { 1913; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0: 1914; GFX900: ; %bb.0: 1915; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1916; GFX900-NEXT: ;;#ASMSTART 1917; GFX900-NEXT: ; def v[0:1] 1918; GFX900-NEXT: ;;#ASMEND 1919; GFX900-NEXT: ;;#ASMSTART 1920; GFX900-NEXT: ; def v[1:2] 1921; GFX900-NEXT: ;;#ASMEND 1922; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1923; GFX900-NEXT: v_mov_b32_e32 v3, 0 1924; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 1925; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 1926; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 1927; GFX900-NEXT: s_waitcnt vmcnt(0) 1928; GFX900-NEXT: s_setpc_b64 s[30:31] 1929; 1930; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0: 1931; GFX90A: ; %bb.0: 1932; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1933; GFX90A-NEXT: ;;#ASMSTART 1934; GFX90A-NEXT: ; def v[0:1] 1935; GFX90A-NEXT: ;;#ASMEND 1936; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1937; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1938; GFX90A-NEXT: ;;#ASMSTART 1939; GFX90A-NEXT: ; def v[2:3] 1940; GFX90A-NEXT: ;;#ASMEND 1941; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 1942; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 1943; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 1944; GFX90A-NEXT: s_waitcnt vmcnt(0) 1945; GFX90A-NEXT: s_setpc_b64 s[30:31] 1946; 1947; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0: 1948; GFX940: ; %bb.0: 1949; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1950; GFX940-NEXT: 
;;#ASMSTART 1951; GFX940-NEXT: ; def v[0:1] 1952; GFX940-NEXT: ;;#ASMEND 1953; GFX940-NEXT: s_mov_b32 s2, 0x5040100 1954; GFX940-NEXT: v_mov_b32_e32 v4, 0 1955; GFX940-NEXT: ;;#ASMSTART 1956; GFX940-NEXT: ; def v[2:3] 1957; GFX940-NEXT: ;;#ASMEND 1958; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 1959; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 1960; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 1961; GFX940-NEXT: s_waitcnt vmcnt(0) 1962; GFX940-NEXT: s_setpc_b64 s[30:31] 1963 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 1964 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 1965 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1966 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 1967 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 0> 1968 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 1969 ret void 1970} 1971; Two-source shuffle, mask <5,5,poison,0>: lanes 0-1 = %extract31[2], lane 2 = poison, lane 3 = %extract3[0]. 1972define void @v_shuffle_v4bf16_v3bf16__5_5_u_0(ptr addrspace(1) inreg %ptr) { 1973; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0: 1974; GFX900: ; %bb.0: 1975; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1976; GFX900-NEXT: ;;#ASMSTART 1977; GFX900-NEXT: ; def v[0:1] 1978; GFX900-NEXT: ;;#ASMEND 1979; GFX900-NEXT: ;;#ASMSTART 1980; GFX900-NEXT: ; def v[1:2] 1981; GFX900-NEXT: ;;#ASMEND 1982; GFX900-NEXT: s_mov_b32 s4, 0x5040100 1983; GFX900-NEXT: v_mov_b32_e32 v3, 0 1984; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 1985; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 1986; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 1987; GFX900-NEXT: s_waitcnt vmcnt(0) 1988; GFX900-NEXT: s_setpc_b64 s[30:31] 1989; 1990; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0: 1991; GFX90A: ; %bb.0: 1992; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1993; GFX90A-NEXT: ;;#ASMSTART 1994; GFX90A-NEXT: ; def v[2:3] 1995; GFX90A-NEXT: ;;#ASMEND 1996; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 1997; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 1998; GFX90A-NEXT: ;;#ASMSTART 1999; GFX90A-NEXT: ; def v[0:1] 2000; GFX90A-NEXT: ;;#ASMEND 2001; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 2002; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 2003; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2004; GFX90A-NEXT: s_waitcnt vmcnt(0) 2005; GFX90A-NEXT: s_setpc_b64 s[30:31] 2006; 2007; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0: 2008; GFX940: ; %bb.0: 2009; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2010; GFX940-NEXT: ;;#ASMSTART 2011; GFX940-NEXT: ; def v[2:3] 2012; GFX940-NEXT: ;;#ASMEND 2013; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2014; GFX940-NEXT: v_mov_b32_e32 v4, 0 2015; GFX940-NEXT: ;;#ASMSTART 2016; GFX940-NEXT: ; def v[0:1] 2017; GFX940-NEXT: ;;#ASMEND 2018; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 2019; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 2020; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2021; GFX940-NEXT: s_waitcnt vmcnt(0) 2022; GFX940-NEXT: s_setpc_b64 s[30:31] 2023 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2024 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2025 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2026 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2027 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 0> 2028 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2029 ret void 2030} 2031; Two-source shuffle, mask <5,5,1,0>: lanes 0-1 = %extract31[2], lane 2 = %extract3[1], lane 3 = %extract3[0]. 2032define void @v_shuffle_v4bf16_v3bf16__5_5_1_0(ptr addrspace(1) inreg %ptr) { 2033; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0: 2034; GFX900: ; %bb.0: 2035; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2036; GFX900-NEXT: ;;#ASMSTART 2037; GFX900-NEXT: ; def v[0:1] 2038; GFX900-NEXT: ;;#ASMEND 2039; GFX900-NEXT: ;;#ASMSTART 2040; GFX900-NEXT: ; def v[1:2] 2041; GFX900-NEXT: ;;#ASMEND 2042; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2043; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 2044; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 2045; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 2046; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 2047; GFX900-NEXT: s_waitcnt vmcnt(0) 2048; GFX900-NEXT: s_setpc_b64 s[30:31] 2049; 2050; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0: 2051; GFX90A: ; %bb.0: 2052; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2053; GFX90A-NEXT: ;;#ASMSTART 2054; GFX90A-NEXT: ; def v[0:1] 2055; GFX90A-NEXT: ;;#ASMEND 2056; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2057; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2058; GFX90A-NEXT: ;;#ASMSTART 2059; GFX90A-NEXT: ; def v[2:3] 2060; GFX90A-NEXT: ;;#ASMEND 2061; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 2062; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 2063; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2064; GFX90A-NEXT: s_waitcnt vmcnt(0) 2065; GFX90A-NEXT: s_setpc_b64 s[30:31] 2066; 2067; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0: 2068; GFX940: ; %bb.0: 2069; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2070; GFX940-NEXT: ;;#ASMSTART 2071; GFX940-NEXT: ; def v[0:1] 2072; GFX940-NEXT: ;;#ASMEND 2073; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2074; GFX940-NEXT: v_mov_b32_e32 v4, 0 2075; GFX940-NEXT: ;;#ASMSTART 2076; GFX940-NEXT: ; def v[2:3] 2077; GFX940-NEXT: ;;#ASMEND 2078; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 2079; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 2080; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 2081; GFX940-NEXT: s_waitcnt vmcnt(0) 2082; GFX940-NEXT: s_setpc_b64 s[30:31] 2083 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2084 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2085 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2086 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2087 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 0> 2088 store <4 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 8 2089 ret void 2090} 2091; Two-source shuffle, mask <5,5,2,0>: lanes 0-1 = %extract31[2], lane 2 = %extract3[2], lane 3 = %extract3[0]. 2092define void @v_shuffle_v4bf16_v3bf16__5_5_2_0(ptr addrspace(1) inreg %ptr) { 2093; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0: 2094; GFX900: ; %bb.0: 2095; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2096; GFX900-NEXT: ;;#ASMSTART 2097; GFX900-NEXT: ; def v[0:1] 2098; GFX900-NEXT: ;;#ASMEND 2099; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2100; GFX900-NEXT: v_mov_b32_e32 v4, 0 2101; GFX900-NEXT: ;;#ASMSTART 2102; GFX900-NEXT: ; def v[2:3] 2103; GFX900-NEXT: ;;#ASMEND 2104; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 2105; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 2106; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2107; GFX900-NEXT: s_waitcnt vmcnt(0) 2108; GFX900-NEXT: s_setpc_b64 s[30:31] 2109; 2110; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0: 2111; GFX90A: ; %bb.0: 2112; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2113; GFX90A-NEXT: ;;#ASMSTART 2114; GFX90A-NEXT: ; def v[0:1] 2115; GFX90A-NEXT: ;;#ASMEND 2116; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2117; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2118; GFX90A-NEXT: ;;#ASMSTART 2119; GFX90A-NEXT: ; def v[2:3] 2120; GFX90A-NEXT: ;;#ASMEND 2121; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 2122; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 2123; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2124; GFX90A-NEXT: s_waitcnt vmcnt(0) 2125; GFX90A-NEXT: s_setpc_b64 s[30:31] 2126; 2127; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0: 2128; GFX940: ; %bb.0: 2129; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2130; GFX940-NEXT: ;;#ASMSTART 2131; GFX940-NEXT: ; def v[0:1] 2132; GFX940-NEXT: ;;#ASMEND 2133; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2134; GFX940-NEXT: v_mov_b32_e32 v4, 0 2135; GFX940-NEXT: ;;#ASMSTART 2136; GFX940-NEXT: ; def v[2:3] 2137; GFX940-NEXT: ;;#ASMEND 2138; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 2139; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 2140; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 2141; GFX940-NEXT: 
s_waitcnt vmcnt(0) 2142; GFX940-NEXT: s_setpc_b64 s[30:31] 2143 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2144 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2145 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2146 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2147 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 0> 2148 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2149 ret void 2150} 2151; Two-source shuffle, mask <5,5,3,0>: lanes 0-1 = %extract31[2], lane 2 = %extract31[0], lane 3 = %extract3[0]. 2152define void @v_shuffle_v4bf16_v3bf16__5_5_3_0(ptr addrspace(1) inreg %ptr) { 2153; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0: 2154; GFX900: ; %bb.0: 2155; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2156; GFX900-NEXT: ;;#ASMSTART 2157; GFX900-NEXT: ; def v[0:1] 2158; GFX900-NEXT: ;;#ASMEND 2159; GFX900-NEXT: ;;#ASMSTART 2160; GFX900-NEXT: ; def v[1:2] 2161; GFX900-NEXT: ;;#ASMEND 2162; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2163; GFX900-NEXT: v_mov_b32_e32 v3, 0 2164; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 2165; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 2166; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 2167; GFX900-NEXT: s_waitcnt vmcnt(0) 2168; GFX900-NEXT: s_setpc_b64 s[30:31] 2169; 2170; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0: 2171; GFX90A: ; %bb.0: 2172; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2173; GFX90A-NEXT: ;;#ASMSTART 2174; GFX90A-NEXT: ; def v[0:1] 2175; GFX90A-NEXT: ;;#ASMEND 2176; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2177; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2178; GFX90A-NEXT: ;;#ASMSTART 2179; GFX90A-NEXT: ; def v[2:3] 2180; GFX90A-NEXT: ;;#ASMEND 2181; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 2182; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 2183; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2184; GFX90A-NEXT: s_waitcnt vmcnt(0) 2185; GFX90A-NEXT: s_setpc_b64 s[30:31] 2186; 2187; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0: 2188; 
GFX940: ; %bb.0: 2189; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2190; GFX940-NEXT: ;;#ASMSTART 2191; GFX940-NEXT: ; def v[0:1] 2192; GFX940-NEXT: ;;#ASMEND 2193; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2194; GFX940-NEXT: v_mov_b32_e32 v4, 0 2195; GFX940-NEXT: ;;#ASMSTART 2196; GFX940-NEXT: ; def v[2:3] 2197; GFX940-NEXT: ;;#ASMEND 2198; GFX940-NEXT: s_nop 0 2199; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 2200; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 2201; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 2202; GFX940-NEXT: s_waitcnt vmcnt(0) 2203; GFX940-NEXT: s_setpc_b64 s[30:31] 2204 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2205 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2206 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2207 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2208 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 0> 2209 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2210 ret void 2211} 2212; Two-source shuffle, mask <5,5,4,0>: lanes 0-1 = %extract31[2], lane 2 = %extract31[1], lane 3 = %extract3[0]. 2213define void @v_shuffle_v4bf16_v3bf16__5_5_4_0(ptr addrspace(1) inreg %ptr) { 2214; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0: 2215; GFX900: ; %bb.0: 2216; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2217; GFX900-NEXT: ;;#ASMSTART 2218; GFX900-NEXT: ; def v[0:1] 2219; GFX900-NEXT: ;;#ASMEND 2220; GFX900-NEXT: ;;#ASMSTART 2221; GFX900-NEXT: ; def v[1:2] 2222; GFX900-NEXT: ;;#ASMEND 2223; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2224; GFX900-NEXT: v_mov_b32_e32 v3, 0 2225; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 2226; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 2227; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 2228; GFX900-NEXT: s_waitcnt vmcnt(0) 2229; GFX900-NEXT: s_setpc_b64 s[30:31] 2230; 2231; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0: 2232; GFX90A: ; %bb.0: 2233; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2234; GFX90A-NEXT: 
;;#ASMSTART 2235; GFX90A-NEXT: ; def v[0:1] 2236; GFX90A-NEXT: ;;#ASMEND 2237; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2238; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2239; GFX90A-NEXT: ;;#ASMSTART 2240; GFX90A-NEXT: ; def v[2:3] 2241; GFX90A-NEXT: ;;#ASMEND 2242; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 2243; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 2244; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2245; GFX90A-NEXT: s_waitcnt vmcnt(0) 2246; GFX90A-NEXT: s_setpc_b64 s[30:31] 2247; 2248; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0: 2249; GFX940: ; %bb.0: 2250; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2251; GFX940-NEXT: ;;#ASMSTART 2252; GFX940-NEXT: ; def v[0:1] 2253; GFX940-NEXT: ;;#ASMEND 2254; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2255; GFX940-NEXT: v_mov_b32_e32 v4, 0 2256; GFX940-NEXT: ;;#ASMSTART 2257; GFX940-NEXT: ; def v[2:3] 2258; GFX940-NEXT: ;;#ASMEND 2259; GFX940-NEXT: s_nop 0 2260; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 2261; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 2262; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 2263; GFX940-NEXT: s_waitcnt vmcnt(0) 2264; GFX940-NEXT: s_setpc_b64 s[30:31] 2265 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2266 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2267 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2268 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2269 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 0> 2270 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2271 ret void 2272} 2273; Single-source shuffle (second operand is poison), mask <poison,1,1,1>: lane 0 = poison, lanes 1-3 = %extract3[1]. 2274define void @v_shuffle_v4bf16_v3bf16__u_1_1_1(ptr addrspace(1) inreg %ptr) { 2275; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1: 2276; GFX900: ; %bb.0: 2277; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2278; GFX900-NEXT: ;;#ASMSTART 2279; GFX900-NEXT: ; def v[0:1] 2280; GFX900-NEXT: ;;#ASMEND 2281; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 2282; GFX900-NEXT: v_mov_b32_e32 v2, 0 2283; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 2284; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2285; GFX900-NEXT: s_waitcnt vmcnt(0) 2286; GFX900-NEXT: s_setpc_b64 s[30:31] 2287; 2288; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1: 2289; GFX90A: ; %bb.0: 2290; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2291; GFX90A-NEXT: ;;#ASMSTART 2292; GFX90A-NEXT: ; def v[0:1] 2293; GFX90A-NEXT: ;;#ASMEND 2294; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2295; GFX90A-NEXT: v_mov_b32_e32 v2, 0 2296; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 2297; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2298; GFX90A-NEXT: s_waitcnt vmcnt(0) 2299; GFX90A-NEXT: s_setpc_b64 s[30:31] 2300; 2301; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1: 2302; GFX940: ; %bb.0: 2303; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2304; GFX940-NEXT: ;;#ASMSTART 2305; GFX940-NEXT: ; def v[0:1] 2306; GFX940-NEXT: ;;#ASMEND 2307; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2308; GFX940-NEXT: v_mov_b32_e32 v2, 0 2309; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 2310; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 2311; GFX940-NEXT: s_waitcnt vmcnt(0) 2312; GFX940-NEXT: s_setpc_b64 s[30:31] 2313 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2314 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2315 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1> 2316 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2317 ret void 2318} 2319; Single-source shuffle (second operand is poison), mask <0,1,1,1>: lane 0 = %extract3[0], lanes 1-3 = %extract3[1]. 2320define void @v_shuffle_v4bf16_v3bf16__0_1_1_1(ptr addrspace(1) inreg %ptr) { 2321; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1: 2322; GFX900: ; %bb.0: 2323; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2324; GFX900-NEXT: ;;#ASMSTART 2325; GFX900-NEXT: ; def v[0:1] 2326; GFX900-NEXT: ;;#ASMEND 2327; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2328; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 2329; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 2330; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2331; GFX900-NEXT: s_waitcnt vmcnt(0) 2332; GFX900-NEXT: s_setpc_b64 s[30:31] 2333; 2334; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1: 2335; GFX90A: ; %bb.0: 2336; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2337; GFX90A-NEXT: ;;#ASMSTART 2338; GFX90A-NEXT: ; def v[0:1] 2339; GFX90A-NEXT: ;;#ASMEND 2340; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2341; GFX90A-NEXT: v_mov_b32_e32 v2, 0 2342; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 2343; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2344; GFX90A-NEXT: s_waitcnt vmcnt(0) 2345; GFX90A-NEXT: s_setpc_b64 s[30:31] 2346; 2347; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1: 2348; GFX940: ; %bb.0: 2349; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2350; GFX940-NEXT: ;;#ASMSTART 2351; GFX940-NEXT: ; def v[0:1] 2352; GFX940-NEXT: ;;#ASMEND 2353; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2354; GFX940-NEXT: v_mov_b32_e32 v2, 0 2355; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 2356; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 2357; GFX940-NEXT: s_waitcnt vmcnt(0) 2358; GFX940-NEXT: s_setpc_b64 s[30:31] 2359 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2360 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2361 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1> 2362 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2363 ret void 2364} 2365; Single-source shuffle (second operand is poison), mask <1,1,1,1>: all four lanes = %extract3[1]. 2366define void @v_shuffle_v4bf16_v3bf16__1_1_1_1(ptr addrspace(1) inreg %ptr) { 2367; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1: 2368; GFX900: ; %bb.0: 2369; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2370; GFX900-NEXT: ;;#ASMSTART 2371; GFX900-NEXT: ; def v[0:1] 2372; GFX900-NEXT: ;;#ASMEND 2373; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2374; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 2375; GFX900-NEXT: v_mov_b32_e32 v2, 
0 2376; GFX900-NEXT: v_mov_b32_e32 v1, v0 2377; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2378; GFX900-NEXT: s_waitcnt vmcnt(0) 2379; GFX900-NEXT: s_setpc_b64 s[30:31] 2380; 2381; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1: 2382; GFX90A: ; %bb.0: 2383; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2384; GFX90A-NEXT: ;;#ASMSTART 2385; GFX90A-NEXT: ; def v[0:1] 2386; GFX90A-NEXT: ;;#ASMEND 2387; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2388; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 2389; GFX90A-NEXT: v_mov_b32_e32 v2, 0 2390; GFX90A-NEXT: v_mov_b32_e32 v1, v0 2391; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2392; GFX90A-NEXT: s_waitcnt vmcnt(0) 2393; GFX90A-NEXT: s_setpc_b64 s[30:31] 2394; 2395; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1: 2396; GFX940: ; %bb.0: 2397; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2398; GFX940-NEXT: ;;#ASMSTART 2399; GFX940-NEXT: ; def v[0:1] 2400; GFX940-NEXT: ;;#ASMEND 2401; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2402; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 2403; GFX940-NEXT: v_mov_b32_e32 v2, 0 2404; GFX940-NEXT: v_mov_b32_e32 v1, v0 2405; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 2406; GFX940-NEXT: s_waitcnt vmcnt(0) 2407; GFX940-NEXT: s_setpc_b64 s[30:31] 2408 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2409 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2410 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 2411 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2412 ret void 2413} 2414; Single-source shuffle (second operand is poison), mask <2,1,1,1>: lane 0 = %extract3[2], lanes 1-3 = %extract3[1]. 2415define void @v_shuffle_v4bf16_v3bf16__2_1_1_1(ptr addrspace(1) inreg %ptr) { 2416; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1: 2417; GFX900: ; %bb.0: 2418; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2419; GFX900-NEXT: ;;#ASMSTART 2420; GFX900-NEXT: ; def v[0:1] 2421; GFX900-NEXT: ;;#ASMEND 2422; GFX900-NEXT: s_mov_b32 s4, 0xffff 2423; GFX900-NEXT: 
v_bfi_b32 v1, s4, v1, v0 2424; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2425; GFX900-NEXT: v_mov_b32_e32 v3, 0 2426; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 2427; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 2428; GFX900-NEXT: s_waitcnt vmcnt(0) 2429; GFX900-NEXT: s_setpc_b64 s[30:31] 2430; 2431; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1: 2432; GFX90A: ; %bb.0: 2433; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2434; GFX90A-NEXT: s_mov_b32 s4, 0xffff 2435; GFX90A-NEXT: ;;#ASMSTART 2436; GFX90A-NEXT: ; def v[0:1] 2437; GFX90A-NEXT: ;;#ASMEND 2438; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 2439; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2440; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2441; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2442; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2443; GFX90A-NEXT: s_waitcnt vmcnt(0) 2444; GFX90A-NEXT: s_setpc_b64 s[30:31] 2445; 2446; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1: 2447; GFX940: ; %bb.0: 2448; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2449; GFX940-NEXT: s_mov_b32 s2, 0xffff 2450; GFX940-NEXT: ;;#ASMSTART 2451; GFX940-NEXT: ; def v[0:1] 2452; GFX940-NEXT: ;;#ASMEND 2453; GFX940-NEXT: v_mov_b32_e32 v4, 0 2454; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 2455; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2456; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2457; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2458; GFX940-NEXT: s_waitcnt vmcnt(0) 2459; GFX940-NEXT: s_setpc_b64 s[30:31] 2460 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2461 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2462 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1> 2463 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2464 ret void 2465} 2466; Single-source shuffle, mask <3,1,1,1>: index 3 addresses element 0 of the poison second operand, so lane 0 is poison; lanes 1-3 = %extract3[1]. 2467define void @v_shuffle_v4bf16_v3bf16__3_1_1_1(ptr addrspace(1) inreg %ptr) { 2468; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1: 2469; GFX900: ; %bb.0: 2470; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2471; GFX900-NEXT: ;;#ASMSTART 2472; GFX900-NEXT: ; def v[0:1] 2473; GFX900-NEXT: ;;#ASMEND 2474; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2475; GFX900-NEXT: v_mov_b32_e32 v2, 0 2476; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 2477; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2478; GFX900-NEXT: s_waitcnt vmcnt(0) 2479; GFX900-NEXT: s_setpc_b64 s[30:31] 2480; 2481; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1: 2482; GFX90A: ; %bb.0: 2483; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2484; GFX90A-NEXT: ;;#ASMSTART 2485; GFX90A-NEXT: ; def v[0:1] 2486; GFX90A-NEXT: ;;#ASMEND 2487; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2488; GFX90A-NEXT: v_mov_b32_e32 v2, 0 2489; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 2490; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 2491; GFX90A-NEXT: s_waitcnt vmcnt(0) 2492; GFX90A-NEXT: s_setpc_b64 s[30:31] 2493; 2494; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1: 2495; GFX940: ; %bb.0: 2496; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2497; GFX940-NEXT: ;;#ASMSTART 2498; GFX940-NEXT: ; def v[0:1] 2499; GFX940-NEXT: ;;#ASMEND 2500; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2501; GFX940-NEXT: v_mov_b32_e32 v2, 0 2502; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 2503; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 2504; GFX940-NEXT: s_waitcnt vmcnt(0) 2505; GFX940-NEXT: s_setpc_b64 s[30:31] 2506 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2507 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2508 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1> 2509 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2510 ret void 2511} 2512 2513define void @v_shuffle_v4bf16_v3bf16__4_1_1_1(ptr addrspace(1) inreg %ptr) { 2514; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1: 2515; GFX900: ; %bb.0: 2516; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2517; GFX900-NEXT: 
;;#ASMSTART 2518; GFX900-NEXT: ; def v[0:1] 2519; GFX900-NEXT: ;;#ASMEND 2520; GFX900-NEXT: ;;#ASMSTART 2521; GFX900-NEXT: ; def v[1:2] 2522; GFX900-NEXT: ;;#ASMEND 2523; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2524; GFX900-NEXT: v_mov_b32_e32 v3, 0 2525; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 2526; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 2527; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 2528; GFX900-NEXT: s_waitcnt vmcnt(0) 2529; GFX900-NEXT: s_setpc_b64 s[30:31] 2530; 2531; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1: 2532; GFX90A: ; %bb.0: 2533; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2534; GFX90A-NEXT: ;;#ASMSTART 2535; GFX90A-NEXT: ; def v[2:3] 2536; GFX90A-NEXT: ;;#ASMEND 2537; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2538; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2539; GFX90A-NEXT: ;;#ASMSTART 2540; GFX90A-NEXT: ; def v[0:1] 2541; GFX90A-NEXT: ;;#ASMEND 2542; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 2543; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2544; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2545; GFX90A-NEXT: s_waitcnt vmcnt(0) 2546; GFX90A-NEXT: s_setpc_b64 s[30:31] 2547; 2548; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1: 2549; GFX940: ; %bb.0: 2550; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2551; GFX940-NEXT: ;;#ASMSTART 2552; GFX940-NEXT: ; def v[2:3] 2553; GFX940-NEXT: ;;#ASMEND 2554; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2555; GFX940-NEXT: v_mov_b32_e32 v4, 0 2556; GFX940-NEXT: ;;#ASMSTART 2557; GFX940-NEXT: ; def v[0:1] 2558; GFX940-NEXT: ;;#ASMEND 2559; GFX940-NEXT: s_nop 0 2560; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 2561; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2562; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2563; GFX940-NEXT: s_waitcnt vmcnt(0) 2564; GFX940-NEXT: s_setpc_b64 s[30:31] 2565 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2566 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2567 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2568 
%extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2569 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 1, i32 1, i32 1> 2570 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2571 ret void 2572} 2573 2574define void @v_shuffle_v4bf16_v3bf16__5_1_1_1(ptr addrspace(1) inreg %ptr) { 2575; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1: 2576; GFX900: ; %bb.0: 2577; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2578; GFX900-NEXT: ;;#ASMSTART 2579; GFX900-NEXT: ; def v[0:1] 2580; GFX900-NEXT: ;;#ASMEND 2581; GFX900-NEXT: ;;#ASMSTART 2582; GFX900-NEXT: ; def v[1:2] 2583; GFX900-NEXT: ;;#ASMEND 2584; GFX900-NEXT: s_mov_b32 s4, 0xffff 2585; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 2586; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2587; GFX900-NEXT: v_mov_b32_e32 v3, 0 2588; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 2589; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 2590; GFX900-NEXT: s_waitcnt vmcnt(0) 2591; GFX900-NEXT: s_setpc_b64 s[30:31] 2592; 2593; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1: 2594; GFX90A: ; %bb.0: 2595; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2596; GFX90A-NEXT: ;;#ASMSTART 2597; GFX90A-NEXT: ; def v[2:3] 2598; GFX90A-NEXT: ;;#ASMEND 2599; GFX90A-NEXT: s_mov_b32 s4, 0xffff 2600; GFX90A-NEXT: ;;#ASMSTART 2601; GFX90A-NEXT: ; def v[0:1] 2602; GFX90A-NEXT: ;;#ASMEND 2603; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 2604; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2605; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2606; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2607; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2608; GFX90A-NEXT: s_waitcnt vmcnt(0) 2609; GFX90A-NEXT: s_setpc_b64 s[30:31] 2610; 2611; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1: 2612; GFX940: ; %bb.0: 2613; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2614; GFX940-NEXT: ;;#ASMSTART 2615; GFX940-NEXT: ; def v[2:3] 2616; GFX940-NEXT: ;;#ASMEND 2617; GFX940-NEXT: s_mov_b32 s2, 
0xffff 2618; GFX940-NEXT: ;;#ASMSTART 2619; GFX940-NEXT: ; def v[0:1] 2620; GFX940-NEXT: ;;#ASMEND 2621; GFX940-NEXT: v_mov_b32_e32 v4, 0 2622; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 2623; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2624; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2625; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2626; GFX940-NEXT: s_waitcnt vmcnt(0) 2627; GFX940-NEXT: s_setpc_b64 s[30:31] 2628 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2629 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2630 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2631 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2632 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 1, i32 1> 2633 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2634 ret void 2635} 2636 2637define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) { 2638; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1: 2639; GFX900: ; %bb.0: 2640; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2641; GFX900-NEXT: ;;#ASMSTART 2642; GFX900-NEXT: ; def v[0:1] 2643; GFX900-NEXT: ;;#ASMEND 2644; GFX900-NEXT: ;;#ASMSTART 2645; GFX900-NEXT: ; def v[1:2] 2646; GFX900-NEXT: ;;#ASMEND 2647; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2648; GFX900-NEXT: v_mov_b32_e32 v3, 0 2649; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 2650; GFX900-NEXT: v_mov_b32_e32 v0, v2 2651; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 2652; GFX900-NEXT: s_waitcnt vmcnt(0) 2653; GFX900-NEXT: s_setpc_b64 s[30:31] 2654; 2655; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1: 2656; GFX90A: ; %bb.0: 2657; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2658; GFX90A-NEXT: ;;#ASMSTART 2659; GFX90A-NEXT: ; def v[0:1] 2660; GFX90A-NEXT: ;;#ASMEND 2661; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2662; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2663; GFX90A-NEXT: ;;#ASMSTART 2664; 
GFX90A-NEXT: ; def v[2:3] 2665; GFX90A-NEXT: ;;#ASMEND 2666; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 2667; GFX90A-NEXT: v_mov_b32_e32 v0, v3 2668; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2669; GFX90A-NEXT: s_waitcnt vmcnt(0) 2670; GFX90A-NEXT: s_setpc_b64 s[30:31] 2671; 2672; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1: 2673; GFX940: ; %bb.0: 2674; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2675; GFX940-NEXT: ;;#ASMSTART 2676; GFX940-NEXT: ; def v[0:1] 2677; GFX940-NEXT: ;;#ASMEND 2678; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2679; GFX940-NEXT: v_mov_b32_e32 v4, 0 2680; GFX940-NEXT: ;;#ASMSTART 2681; GFX940-NEXT: ; def v[2:3] 2682; GFX940-NEXT: ;;#ASMEND 2683; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 2684; GFX940-NEXT: v_mov_b32_e32 v0, v3 2685; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 2686; GFX940-NEXT: s_waitcnt vmcnt(0) 2687; GFX940-NEXT: s_setpc_b64 s[30:31] 2688 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2689 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2690 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2691 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2692 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 1, i32 1> 2693 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2694 ret void 2695} 2696 2697define void @v_shuffle_v4bf16_v3bf16__5_0_1_1(ptr addrspace(1) inreg %ptr) { 2698; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1: 2699; GFX900: ; %bb.0: 2700; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2701; GFX900-NEXT: ;;#ASMSTART 2702; GFX900-NEXT: ; def v[0:1] 2703; GFX900-NEXT: ;;#ASMEND 2704; GFX900-NEXT: ;;#ASMSTART 2705; GFX900-NEXT: ; def v[1:2] 2706; GFX900-NEXT: ;;#ASMEND 2707; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2708; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 2709; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2710; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 2711; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 2712; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 2713; GFX900-NEXT: s_waitcnt vmcnt(0) 2714; GFX900-NEXT: s_setpc_b64 s[30:31] 2715; 2716; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1: 2717; GFX90A: ; %bb.0: 2718; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2719; GFX90A-NEXT: ;;#ASMSTART 2720; GFX90A-NEXT: ; def v[2:3] 2721; GFX90A-NEXT: ;;#ASMEND 2722; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2723; GFX90A-NEXT: ;;#ASMSTART 2724; GFX90A-NEXT: ; def v[0:1] 2725; GFX90A-NEXT: ;;#ASMEND 2726; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 2727; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2728; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2729; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2730; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2731; GFX90A-NEXT: s_waitcnt vmcnt(0) 2732; GFX90A-NEXT: s_setpc_b64 s[30:31] 2733; 2734; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1: 2735; GFX940: ; %bb.0: 2736; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2737; GFX940-NEXT: ;;#ASMSTART 2738; GFX940-NEXT: ; def v[2:3] 2739; GFX940-NEXT: ;;#ASMEND 2740; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2741; GFX940-NEXT: ;;#ASMSTART 2742; GFX940-NEXT: ; def v[0:1] 2743; GFX940-NEXT: ;;#ASMEND 2744; GFX940-NEXT: v_mov_b32_e32 v4, 0 2745; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 2746; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2747; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2748; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2749; GFX940-NEXT: s_waitcnt vmcnt(0) 2750; GFX940-NEXT: s_setpc_b64 s[30:31] 2751 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2752 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2753 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2754 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2755 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 1, i32 
1> 2756 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2757 ret void 2758} 2759 2760define void @v_shuffle_v4bf16_v3bf16__5_2_1_1(ptr addrspace(1) inreg %ptr) { 2761; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1: 2762; GFX900: ; %bb.0: 2763; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2764; GFX900-NEXT: ;;#ASMSTART 2765; GFX900-NEXT: ; def v[0:1] 2766; GFX900-NEXT: ;;#ASMEND 2767; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2768; GFX900-NEXT: ;;#ASMSTART 2769; GFX900-NEXT: ; def v[2:3] 2770; GFX900-NEXT: ;;#ASMEND 2771; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 2772; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2773; GFX900-NEXT: v_mov_b32_e32 v4, 0 2774; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 2775; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] 2776; GFX900-NEXT: s_waitcnt vmcnt(0) 2777; GFX900-NEXT: s_setpc_b64 s[30:31] 2778; 2779; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1: 2780; GFX90A: ; %bb.0: 2781; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2782; GFX90A-NEXT: ;;#ASMSTART 2783; GFX90A-NEXT: ; def v[2:3] 2784; GFX90A-NEXT: ;;#ASMEND 2785; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2786; GFX90A-NEXT: ;;#ASMSTART 2787; GFX90A-NEXT: ; def v[0:1] 2788; GFX90A-NEXT: ;;#ASMEND 2789; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 2790; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2791; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2792; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2793; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2794; GFX90A-NEXT: s_waitcnt vmcnt(0) 2795; GFX90A-NEXT: s_setpc_b64 s[30:31] 2796; 2797; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1: 2798; GFX940: ; %bb.0: 2799; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2800; GFX940-NEXT: ;;#ASMSTART 2801; GFX940-NEXT: ; def v[2:3] 2802; GFX940-NEXT: ;;#ASMEND 2803; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2804; GFX940-NEXT: ;;#ASMSTART 2805; GFX940-NEXT: ; def v[0:1] 2806; GFX940-NEXT: ;;#ASMEND 2807; GFX940-NEXT: v_mov_b32_e32 v4, 0 2808; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 2809; GFX940-NEXT: 
s_mov_b32 s2, 0x7060302 2810; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2811; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2812; GFX940-NEXT: s_waitcnt vmcnt(0) 2813; GFX940-NEXT: s_setpc_b64 s[30:31] 2814 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2815 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2816 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2817 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2818 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 1, i32 1> 2819 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2820 ret void 2821} 2822 2823define void @v_shuffle_v4bf16_v3bf16__5_3_1_1(ptr addrspace(1) inreg %ptr) { 2824; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1: 2825; GFX900: ; %bb.0: 2826; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2827; GFX900-NEXT: ;;#ASMSTART 2828; GFX900-NEXT: ; def v[0:1] 2829; GFX900-NEXT: ;;#ASMEND 2830; GFX900-NEXT: ;;#ASMSTART 2831; GFX900-NEXT: ; def v[1:2] 2832; GFX900-NEXT: ;;#ASMEND 2833; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2834; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 2835; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2836; GFX900-NEXT: v_mov_b32_e32 v3, 0 2837; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 2838; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 2839; GFX900-NEXT: s_waitcnt vmcnt(0) 2840; GFX900-NEXT: s_setpc_b64 s[30:31] 2841; 2842; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1: 2843; GFX90A: ; %bb.0: 2844; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2845; GFX90A-NEXT: ;;#ASMSTART 2846; GFX90A-NEXT: ; def v[2:3] 2847; GFX90A-NEXT: ;;#ASMEND 2848; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2849; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 2850; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2851; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2852; GFX90A-NEXT: ;;#ASMSTART 2853; GFX90A-NEXT: ; def v[0:1] 2854; GFX90A-NEXT: ;;#ASMEND 2855; 
GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2856; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2857; GFX90A-NEXT: s_waitcnt vmcnt(0) 2858; GFX90A-NEXT: s_setpc_b64 s[30:31] 2859; 2860; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1: 2861; GFX940: ; %bb.0: 2862; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2863; GFX940-NEXT: ;;#ASMSTART 2864; GFX940-NEXT: ; def v[2:3] 2865; GFX940-NEXT: ;;#ASMEND 2866; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2867; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 2868; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2869; GFX940-NEXT: v_mov_b32_e32 v4, 0 2870; GFX940-NEXT: ;;#ASMSTART 2871; GFX940-NEXT: ; def v[0:1] 2872; GFX940-NEXT: ;;#ASMEND 2873; GFX940-NEXT: s_nop 0 2874; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2875; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2876; GFX940-NEXT: s_waitcnt vmcnt(0) 2877; GFX940-NEXT: s_setpc_b64 s[30:31] 2878 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2879 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2880 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2881 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2882 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 1, i32 1> 2883 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2884 ret void 2885} 2886 2887define void @v_shuffle_v4bf16_v3bf16__5_4_1_1(ptr addrspace(1) inreg %ptr) { 2888; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1: 2889; GFX900: ; %bb.0: 2890; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2891; GFX900-NEXT: ;;#ASMSTART 2892; GFX900-NEXT: ; def v[0:1] 2893; GFX900-NEXT: ;;#ASMEND 2894; GFX900-NEXT: ;;#ASMSTART 2895; GFX900-NEXT: ; def v[1:2] 2896; GFX900-NEXT: ;;#ASMEND 2897; GFX900-NEXT: s_mov_b32 s4, 0xffff 2898; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 2899; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2900; GFX900-NEXT: v_mov_b32_e32 v3, 0 2901; GFX900-NEXT: 
v_perm_b32 v2, v0, v0, s4 2902; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 2903; GFX900-NEXT: s_waitcnt vmcnt(0) 2904; GFX900-NEXT: s_setpc_b64 s[30:31] 2905; 2906; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1: 2907; GFX90A: ; %bb.0: 2908; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2909; GFX90A-NEXT: ;;#ASMSTART 2910; GFX90A-NEXT: ; def v[2:3] 2911; GFX90A-NEXT: ;;#ASMEND 2912; GFX90A-NEXT: s_mov_b32 s4, 0xffff 2913; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 2914; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2915; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2916; GFX90A-NEXT: ;;#ASMSTART 2917; GFX90A-NEXT: ; def v[0:1] 2918; GFX90A-NEXT: ;;#ASMEND 2919; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 2920; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 2921; GFX90A-NEXT: s_waitcnt vmcnt(0) 2922; GFX90A-NEXT: s_setpc_b64 s[30:31] 2923; 2924; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1: 2925; GFX940: ; %bb.0: 2926; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2927; GFX940-NEXT: ;;#ASMSTART 2928; GFX940-NEXT: ; def v[2:3] 2929; GFX940-NEXT: ;;#ASMEND 2930; GFX940-NEXT: s_mov_b32 s2, 0xffff 2931; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 2932; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2933; GFX940-NEXT: v_mov_b32_e32 v4, 0 2934; GFX940-NEXT: ;;#ASMSTART 2935; GFX940-NEXT: ; def v[0:1] 2936; GFX940-NEXT: ;;#ASMEND 2937; GFX940-NEXT: s_nop 0 2938; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 2939; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 2940; GFX940-NEXT: s_waitcnt vmcnt(0) 2941; GFX940-NEXT: s_setpc_b64 s[30:31] 2942 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 2943 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 2944 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2945 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 2946 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 1, i32 1> 2947 store <4 x 
bfloat> %shuf, ptr addrspace(1) %ptr, align 8 2948 ret void 2949} 2950 2951define void @v_shuffle_v4bf16_v3bf16__5_5_1_1(ptr addrspace(1) inreg %ptr) { 2952; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1: 2953; GFX900: ; %bb.0: 2954; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2955; GFX900-NEXT: ;;#ASMSTART 2956; GFX900-NEXT: ; def v[0:1] 2957; GFX900-NEXT: ;;#ASMEND 2958; GFX900-NEXT: ;;#ASMSTART 2959; GFX900-NEXT: ; def v[1:2] 2960; GFX900-NEXT: ;;#ASMEND 2961; GFX900-NEXT: s_mov_b32 s4, 0x7060302 2962; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 2963; GFX900-NEXT: s_mov_b32 s4, 0x5040100 2964; GFX900-NEXT: v_mov_b32_e32 v3, 0 2965; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 2966; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 2967; GFX900-NEXT: s_waitcnt vmcnt(0) 2968; GFX900-NEXT: s_setpc_b64 s[30:31] 2969; 2970; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1: 2971; GFX90A: ; %bb.0: 2972; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2973; GFX90A-NEXT: ;;#ASMSTART 2974; GFX90A-NEXT: ; def v[0:1] 2975; GFX90A-NEXT: ;;#ASMEND 2976; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 2977; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 2978; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 2979; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2980; GFX90A-NEXT: ;;#ASMSTART 2981; GFX90A-NEXT: ; def v[2:3] 2982; GFX90A-NEXT: ;;#ASMEND 2983; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 2984; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 2985; GFX90A-NEXT: s_waitcnt vmcnt(0) 2986; GFX90A-NEXT: s_setpc_b64 s[30:31] 2987; 2988; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1: 2989; GFX940: ; %bb.0: 2990; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2991; GFX940-NEXT: ;;#ASMSTART 2992; GFX940-NEXT: ; def v[0:1] 2993; GFX940-NEXT: ;;#ASMEND 2994; GFX940-NEXT: s_mov_b32 s2, 0x7060302 2995; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 2996; GFX940-NEXT: s_mov_b32 s2, 0x5040100 2997; GFX940-NEXT: v_mov_b32_e32 v4, 0 2998; GFX940-NEXT: ;;#ASMSTART 2999; GFX940-NEXT: ; def v[2:3] 3000; GFX940-NEXT: 
;;#ASMEND 3001; GFX940-NEXT: s_nop 0 3002; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 3003; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3004; GFX940-NEXT: s_waitcnt vmcnt(0) 3005; GFX940-NEXT: s_setpc_b64 s[30:31] 3006 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3007 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3008 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3009 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3010 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 1> 3011 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3012 ret void 3013} 3014 3015define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) { 3016; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1: 3017; GFX900: ; %bb.0: 3018; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3019; GFX900-NEXT: ;;#ASMSTART 3020; GFX900-NEXT: ; def v[0:1] 3021; GFX900-NEXT: ;;#ASMEND 3022; GFX900-NEXT: ;;#ASMSTART 3023; GFX900-NEXT: ; def v[1:2] 3024; GFX900-NEXT: ;;#ASMEND 3025; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3026; GFX900-NEXT: v_mov_b32_e32 v3, 0 3027; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 3028; GFX900-NEXT: v_mov_b32_e32 v2, v0 3029; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 3030; GFX900-NEXT: s_waitcnt vmcnt(0) 3031; GFX900-NEXT: s_setpc_b64 s[30:31] 3032; 3033; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1: 3034; GFX90A: ; %bb.0: 3035; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3036; GFX90A-NEXT: ;;#ASMSTART 3037; GFX90A-NEXT: ; def v[2:3] 3038; GFX90A-NEXT: ;;#ASMEND 3039; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3040; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3041; GFX90A-NEXT: ;;#ASMSTART 3042; GFX90A-NEXT: ; def v[0:1] 3043; GFX90A-NEXT: ;;#ASMEND 3044; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 3045; GFX90A-NEXT: v_mov_b32_e32 v3, v0 3046; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], 
s[16:17] 3047; GFX90A-NEXT: s_waitcnt vmcnt(0) 3048; GFX90A-NEXT: s_setpc_b64 s[30:31] 3049; 3050; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1: 3051; GFX940: ; %bb.0: 3052; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3053; GFX940-NEXT: ;;#ASMSTART 3054; GFX940-NEXT: ; def v[2:3] 3055; GFX940-NEXT: ;;#ASMEND 3056; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3057; GFX940-NEXT: v_mov_b32_e32 v4, 0 3058; GFX940-NEXT: ;;#ASMSTART 3059; GFX940-NEXT: ; def v[0:1] 3060; GFX940-NEXT: ;;#ASMEND 3061; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 3062; GFX940-NEXT: v_mov_b32_e32 v3, v0 3063; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 3064; GFX940-NEXT: s_waitcnt vmcnt(0) 3065; GFX940-NEXT: s_setpc_b64 s[30:31] 3066 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3067 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3068 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3069 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3070 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 1> 3071 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3072 ret void 3073} 3074 3075define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) { 3076; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1: 3077; GFX900: ; %bb.0: 3078; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3079; GFX900-NEXT: ;;#ASMSTART 3080; GFX900-NEXT: ; def v[0:1] 3081; GFX900-NEXT: ;;#ASMEND 3082; GFX900-NEXT: ;;#ASMSTART 3083; GFX900-NEXT: ; def v[1:2] 3084; GFX900-NEXT: ;;#ASMEND 3085; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3086; GFX900-NEXT: v_mov_b32_e32 v3, 0 3087; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 3088; GFX900-NEXT: v_mov_b32_e32 v2, v0 3089; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 3090; GFX900-NEXT: s_waitcnt vmcnt(0) 3091; GFX900-NEXT: s_setpc_b64 s[30:31] 3092; 3093; GFX90A-LABEL: 
v_shuffle_v4bf16_v3bf16__5_5_0_1: 3094; GFX90A: ; %bb.0: 3095; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3096; GFX90A-NEXT: ;;#ASMSTART 3097; GFX90A-NEXT: ; def v[2:3] 3098; GFX90A-NEXT: ;;#ASMEND 3099; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3100; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3101; GFX90A-NEXT: ;;#ASMSTART 3102; GFX90A-NEXT: ; def v[0:1] 3103; GFX90A-NEXT: ;;#ASMEND 3104; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 3105; GFX90A-NEXT: v_mov_b32_e32 v3, v0 3106; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 3107; GFX90A-NEXT: s_waitcnt vmcnt(0) 3108; GFX90A-NEXT: s_setpc_b64 s[30:31] 3109; 3110; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1: 3111; GFX940: ; %bb.0: 3112; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3113; GFX940-NEXT: ;;#ASMSTART 3114; GFX940-NEXT: ; def v[2:3] 3115; GFX940-NEXT: ;;#ASMEND 3116; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3117; GFX940-NEXT: v_mov_b32_e32 v4, 0 3118; GFX940-NEXT: ;;#ASMSTART 3119; GFX940-NEXT: ; def v[0:1] 3120; GFX940-NEXT: ;;#ASMEND 3121; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 3122; GFX940-NEXT: v_mov_b32_e32 v3, v0 3123; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 3124; GFX940-NEXT: s_waitcnt vmcnt(0) 3125; GFX940-NEXT: s_setpc_b64 s[30:31] 3126 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3127 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3128 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3129 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3130 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 1> 3131 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3132 ret void 3133} 3134 3135define void @v_shuffle_v4bf16_v3bf16__5_5_2_1(ptr addrspace(1) inreg %ptr) { 3136; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1: 3137; GFX900: ; %bb.0: 3138; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3139; GFX900-NEXT: 
;;#ASMSTART 3140; GFX900-NEXT: ; def v[0:1] 3141; GFX900-NEXT: ;;#ASMEND 3142; GFX900-NEXT: s_mov_b32 s4, 0xffff 3143; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 3144; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3145; GFX900-NEXT: v_mov_b32_e32 v4, 0 3146; GFX900-NEXT: ;;#ASMSTART 3147; GFX900-NEXT: ; def v[2:3] 3148; GFX900-NEXT: ;;#ASMEND 3149; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 3150; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3151; GFX900-NEXT: s_waitcnt vmcnt(0) 3152; GFX900-NEXT: s_setpc_b64 s[30:31] 3153; 3154; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1: 3155; GFX90A: ; %bb.0: 3156; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3157; GFX90A-NEXT: ;;#ASMSTART 3158; GFX90A-NEXT: ; def v[0:1] 3159; GFX90A-NEXT: ;;#ASMEND 3160; GFX90A-NEXT: s_mov_b32 s4, 0xffff 3161; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 3162; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3163; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3164; GFX90A-NEXT: ;;#ASMSTART 3165; GFX90A-NEXT: ; def v[2:3] 3166; GFX90A-NEXT: ;;#ASMEND 3167; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 3168; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3169; GFX90A-NEXT: s_waitcnt vmcnt(0) 3170; GFX90A-NEXT: s_setpc_b64 s[30:31] 3171; 3172; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1: 3173; GFX940: ; %bb.0: 3174; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3175; GFX940-NEXT: ;;#ASMSTART 3176; GFX940-NEXT: ; def v[0:1] 3177; GFX940-NEXT: ;;#ASMEND 3178; GFX940-NEXT: s_mov_b32 s2, 0xffff 3179; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 3180; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3181; GFX940-NEXT: v_mov_b32_e32 v4, 0 3182; GFX940-NEXT: ;;#ASMSTART 3183; GFX940-NEXT: ; def v[2:3] 3184; GFX940-NEXT: ;;#ASMEND 3185; GFX940-NEXT: s_nop 0 3186; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 3187; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3188; GFX940-NEXT: s_waitcnt vmcnt(0) 3189; GFX940-NEXT: s_setpc_b64 s[30:31] 3190 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3191 %vec1 = call <4 x bfloat> asm "; def 
$0", "=v"() 3192 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3193 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3194 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 1> 3195 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3196 ret void 3197} 3198 3199define void @v_shuffle_v4bf16_v3bf16__5_5_3_1(ptr addrspace(1) inreg %ptr) { 3200; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1: 3201; GFX900: ; %bb.0: 3202; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3203; GFX900-NEXT: ;;#ASMSTART 3204; GFX900-NEXT: ; def v[0:1] 3205; GFX900-NEXT: ;;#ASMEND 3206; GFX900-NEXT: ;;#ASMSTART 3207; GFX900-NEXT: ; def v[1:2] 3208; GFX900-NEXT: ;;#ASMEND 3209; GFX900-NEXT: s_mov_b32 s4, 0xffff 3210; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 3211; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3212; GFX900-NEXT: v_mov_b32_e32 v3, 0 3213; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 3214; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 3215; GFX900-NEXT: s_waitcnt vmcnt(0) 3216; GFX900-NEXT: s_setpc_b64 s[30:31] 3217; 3218; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1: 3219; GFX90A: ; %bb.0: 3220; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3221; GFX90A-NEXT: ;;#ASMSTART 3222; GFX90A-NEXT: ; def v[0:1] 3223; GFX90A-NEXT: ;;#ASMEND 3224; GFX90A-NEXT: s_mov_b32 s4, 0xffff 3225; GFX90A-NEXT: ;;#ASMSTART 3226; GFX90A-NEXT: ; def v[2:3] 3227; GFX90A-NEXT: ;;#ASMEND 3228; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 3229; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3230; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3231; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 3232; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3233; GFX90A-NEXT: s_waitcnt vmcnt(0) 3234; GFX90A-NEXT: s_setpc_b64 s[30:31] 3235; 3236; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1: 3237; GFX940: ; %bb.0: 3238; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3239; 
GFX940-NEXT: ;;#ASMSTART 3240; GFX940-NEXT: ; def v[0:1] 3241; GFX940-NEXT: ;;#ASMEND 3242; GFX940-NEXT: s_mov_b32 s2, 0xffff 3243; GFX940-NEXT: ;;#ASMSTART 3244; GFX940-NEXT: ; def v[2:3] 3245; GFX940-NEXT: ;;#ASMEND 3246; GFX940-NEXT: v_mov_b32_e32 v4, 0 3247; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 3248; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3249; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 3250; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3251; GFX940-NEXT: s_waitcnt vmcnt(0) 3252; GFX940-NEXT: s_setpc_b64 s[30:31] 3253 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3254 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3255 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3256 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3257 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 1> 3258 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3259 ret void 3260} 3261 3262define void @v_shuffle_v4bf16_v3bf16__5_5_4_1(ptr addrspace(1) inreg %ptr) { 3263; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1: 3264; GFX900: ; %bb.0: 3265; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3266; GFX900-NEXT: ;;#ASMSTART 3267; GFX900-NEXT: ; def v[0:1] 3268; GFX900-NEXT: ;;#ASMEND 3269; GFX900-NEXT: ;;#ASMSTART 3270; GFX900-NEXT: ; def v[1:2] 3271; GFX900-NEXT: ;;#ASMEND 3272; GFX900-NEXT: s_mov_b32 s4, 0x7060302 3273; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 3274; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3275; GFX900-NEXT: v_mov_b32_e32 v3, 0 3276; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 3277; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 3278; GFX900-NEXT: s_waitcnt vmcnt(0) 3279; GFX900-NEXT: s_setpc_b64 s[30:31] 3280; 3281; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1: 3282; GFX90A: ; %bb.0: 3283; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3284; GFX90A-NEXT: ;;#ASMSTART 3285; GFX90A-NEXT: 
; def v[0:1] 3286; GFX90A-NEXT: ;;#ASMEND 3287; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 3288; GFX90A-NEXT: ;;#ASMSTART 3289; GFX90A-NEXT: ; def v[2:3] 3290; GFX90A-NEXT: ;;#ASMEND 3291; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 3292; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3293; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3294; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 3295; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3296; GFX90A-NEXT: s_waitcnt vmcnt(0) 3297; GFX90A-NEXT: s_setpc_b64 s[30:31] 3298; 3299; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1: 3300; GFX940: ; %bb.0: 3301; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3302; GFX940-NEXT: ;;#ASMSTART 3303; GFX940-NEXT: ; def v[0:1] 3304; GFX940-NEXT: ;;#ASMEND 3305; GFX940-NEXT: s_mov_b32 s2, 0x7060302 3306; GFX940-NEXT: ;;#ASMSTART 3307; GFX940-NEXT: ; def v[2:3] 3308; GFX940-NEXT: ;;#ASMEND 3309; GFX940-NEXT: v_mov_b32_e32 v4, 0 3310; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 3311; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3312; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 3313; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3314; GFX940-NEXT: s_waitcnt vmcnt(0) 3315; GFX940-NEXT: s_setpc_b64 s[30:31] 3316 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3317 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3318 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3319 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3320 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 1> 3321 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3322 ret void 3323} 3324 3325define void @v_shuffle_v4bf16_v3bf16__u_2_2_2(ptr addrspace(1) inreg %ptr) { 3326; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2: 3327; GFX900: ; %bb.0: 3328; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3329; GFX900-NEXT: ;;#ASMSTART 3330; GFX900-NEXT: ; def v[0:1] 3331; GFX900-NEXT: 
;;#ASMEND 3332; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3333; GFX900-NEXT: v_mov_b32_e32 v3, 0 3334; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 3335; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3336; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 3337; GFX900-NEXT: s_waitcnt vmcnt(0) 3338; GFX900-NEXT: s_setpc_b64 s[30:31] 3339; 3340; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2: 3341; GFX90A: ; %bb.0: 3342; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3343; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3344; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3345; GFX90A-NEXT: ;;#ASMSTART 3346; GFX90A-NEXT: ; def v[0:1] 3347; GFX90A-NEXT: ;;#ASMEND 3348; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 3349; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 3350; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 3351; GFX90A-NEXT: s_waitcnt vmcnt(0) 3352; GFX90A-NEXT: s_setpc_b64 s[30:31] 3353; 3354; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2: 3355; GFX940: ; %bb.0: 3356; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3357; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3358; GFX940-NEXT: v_mov_b32_e32 v4, 0 3359; GFX940-NEXT: ;;#ASMSTART 3360; GFX940-NEXT: ; def v[0:1] 3361; GFX940-NEXT: ;;#ASMEND 3362; GFX940-NEXT: s_nop 0 3363; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 3364; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 3365; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 3366; GFX940-NEXT: s_waitcnt vmcnt(0) 3367; GFX940-NEXT: s_setpc_b64 s[30:31] 3368 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3369 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3370 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2> 3371 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3372 ret void 3373} 3374 3375define void @v_shuffle_v4bf16_v3bf16__0_2_2_2(ptr addrspace(1) inreg %ptr) { 3376; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2: 3377; GFX900: ; %bb.0: 3378; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3379; GFX900-NEXT: ;;#ASMSTART 3380; GFX900-NEXT: ; def v[0:1] 3381; GFX900-NEXT: ;;#ASMEND 3382; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3383; GFX900-NEXT: v_mov_b32_e32 v2, 0 3384; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 3385; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3386; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 3387; GFX900-NEXT: s_waitcnt vmcnt(0) 3388; GFX900-NEXT: s_setpc_b64 s[30:31] 3389; 3390; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2: 3391; GFX90A: ; %bb.0: 3392; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3393; GFX90A-NEXT: ;;#ASMSTART 3394; GFX90A-NEXT: ; def v[0:1] 3395; GFX90A-NEXT: ;;#ASMEND 3396; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3397; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3398; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 3399; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3400; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 3401; GFX90A-NEXT: s_waitcnt vmcnt(0) 3402; GFX90A-NEXT: s_setpc_b64 s[30:31] 3403; 3404; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2: 3405; GFX940: ; %bb.0: 3406; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3407; GFX940-NEXT: ;;#ASMSTART 3408; GFX940-NEXT: ; def v[0:1] 3409; GFX940-NEXT: ;;#ASMEND 3410; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3411; GFX940-NEXT: v_mov_b32_e32 v2, 0 3412; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 3413; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3414; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 3415; GFX940-NEXT: s_waitcnt vmcnt(0) 3416; GFX940-NEXT: s_setpc_b64 s[30:31] 3417 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3418 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3419 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2> 3420 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3421 ret void 3422} 3423 3424define void @v_shuffle_v4bf16_v3bf16__1_2_2_2(ptr addrspace(1) inreg %ptr) { 3425; GFX900-LABEL: 
v_shuffle_v4bf16_v3bf16__1_2_2_2: 3426; GFX900: ; %bb.0: 3427; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3428; GFX900-NEXT: ;;#ASMSTART 3429; GFX900-NEXT: ; def v[0:1] 3430; GFX900-NEXT: ;;#ASMEND 3431; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3432; GFX900-NEXT: v_mov_b32_e32 v3, 0 3433; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 3434; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 3435; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 3436; GFX900-NEXT: s_waitcnt vmcnt(0) 3437; GFX900-NEXT: s_setpc_b64 s[30:31] 3438; 3439; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2: 3440; GFX90A: ; %bb.0: 3441; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3442; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3443; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3444; GFX90A-NEXT: ;;#ASMSTART 3445; GFX90A-NEXT: ; def v[0:1] 3446; GFX90A-NEXT: ;;#ASMEND 3447; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 3448; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 3449; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 3450; GFX90A-NEXT: s_waitcnt vmcnt(0) 3451; GFX90A-NEXT: s_setpc_b64 s[30:31] 3452; 3453; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2: 3454; GFX940: ; %bb.0: 3455; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3456; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3457; GFX940-NEXT: v_mov_b32_e32 v4, 0 3458; GFX940-NEXT: ;;#ASMSTART 3459; GFX940-NEXT: ; def v[0:1] 3460; GFX940-NEXT: ;;#ASMEND 3461; GFX940-NEXT: s_nop 0 3462; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 3463; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 3464; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 3465; GFX940-NEXT: s_waitcnt vmcnt(0) 3466; GFX940-NEXT: s_setpc_b64 s[30:31] 3467 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3468 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3469 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2> 3470 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3471 ret void 
3472} 3473 3474define void @v_shuffle_v4bf16_v3bf16__2_2_2_2(ptr addrspace(1) inreg %ptr) { 3475; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2: 3476; GFX900: ; %bb.0: 3477; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3478; GFX900-NEXT: ;;#ASMSTART 3479; GFX900-NEXT: ; def v[0:1] 3480; GFX900-NEXT: ;;#ASMEND 3481; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3482; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 3483; GFX900-NEXT: v_mov_b32_e32 v2, 0 3484; GFX900-NEXT: v_mov_b32_e32 v1, v0 3485; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 3486; GFX900-NEXT: s_waitcnt vmcnt(0) 3487; GFX900-NEXT: s_setpc_b64 s[30:31] 3488; 3489; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2: 3490; GFX90A: ; %bb.0: 3491; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3492; GFX90A-NEXT: ;;#ASMSTART 3493; GFX90A-NEXT: ; def v[0:1] 3494; GFX90A-NEXT: ;;#ASMEND 3495; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3496; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 3497; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3498; GFX90A-NEXT: v_mov_b32_e32 v1, v0 3499; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 3500; GFX90A-NEXT: s_waitcnt vmcnt(0) 3501; GFX90A-NEXT: s_setpc_b64 s[30:31] 3502; 3503; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2: 3504; GFX940: ; %bb.0: 3505; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3506; GFX940-NEXT: ;;#ASMSTART 3507; GFX940-NEXT: ; def v[0:1] 3508; GFX940-NEXT: ;;#ASMEND 3509; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3510; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 3511; GFX940-NEXT: v_mov_b32_e32 v2, 0 3512; GFX940-NEXT: v_mov_b32_e32 v1, v0 3513; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 3514; GFX940-NEXT: s_waitcnt vmcnt(0) 3515; GFX940-NEXT: s_setpc_b64 s[30:31] 3516 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3517 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3518 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 3519 store <4 x 
bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3520 ret void 3521} 3522 3523define void @v_shuffle_v4bf16_v3bf16__3_2_2_2(ptr addrspace(1) inreg %ptr) { 3524; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2: 3525; GFX900: ; %bb.0: 3526; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3527; GFX900-NEXT: ;;#ASMSTART 3528; GFX900-NEXT: ; def v[0:1] 3529; GFX900-NEXT: ;;#ASMEND 3530; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3531; GFX900-NEXT: v_mov_b32_e32 v3, 0 3532; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 3533; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3534; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 3535; GFX900-NEXT: s_waitcnt vmcnt(0) 3536; GFX900-NEXT: s_setpc_b64 s[30:31] 3537; 3538; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2: 3539; GFX90A: ; %bb.0: 3540; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3541; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3542; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3543; GFX90A-NEXT: ;;#ASMSTART 3544; GFX90A-NEXT: ; def v[0:1] 3545; GFX90A-NEXT: ;;#ASMEND 3546; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 3547; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 3548; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 3549; GFX90A-NEXT: s_waitcnt vmcnt(0) 3550; GFX90A-NEXT: s_setpc_b64 s[30:31] 3551; 3552; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2: 3553; GFX940: ; %bb.0: 3554; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3555; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3556; GFX940-NEXT: v_mov_b32_e32 v4, 0 3557; GFX940-NEXT: ;;#ASMSTART 3558; GFX940-NEXT: ; def v[0:1] 3559; GFX940-NEXT: ;;#ASMEND 3560; GFX940-NEXT: s_nop 0 3561; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 3562; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 3563; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 3564; GFX940-NEXT: s_waitcnt vmcnt(0) 3565; GFX940-NEXT: s_setpc_b64 s[30:31] 3566 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3567 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3568 %shuf = 
shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2> 3569 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3570 ret void 3571} 3572 3573define void @v_shuffle_v4bf16_v3bf16__4_2_2_2(ptr addrspace(1) inreg %ptr) { 3574; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2: 3575; GFX900: ; %bb.0: 3576; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3577; GFX900-NEXT: ;;#ASMSTART 3578; GFX900-NEXT: ; def v[2:3] 3579; GFX900-NEXT: ;;#ASMEND 3580; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3581; GFX900-NEXT: v_mov_b32_e32 v4, 0 3582; GFX900-NEXT: ;;#ASMSTART 3583; GFX900-NEXT: ; def v[0:1] 3584; GFX900-NEXT: ;;#ASMEND 3585; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 3586; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 3587; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 3588; GFX900-NEXT: s_waitcnt vmcnt(0) 3589; GFX900-NEXT: s_setpc_b64 s[30:31] 3590; 3591; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2: 3592; GFX90A: ; %bb.0: 3593; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3594; GFX90A-NEXT: ;;#ASMSTART 3595; GFX90A-NEXT: ; def v[2:3] 3596; GFX90A-NEXT: ;;#ASMEND 3597; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3598; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3599; GFX90A-NEXT: ;;#ASMSTART 3600; GFX90A-NEXT: ; def v[0:1] 3601; GFX90A-NEXT: ;;#ASMEND 3602; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 3603; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 3604; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 3605; GFX90A-NEXT: s_waitcnt vmcnt(0) 3606; GFX90A-NEXT: s_setpc_b64 s[30:31] 3607; 3608; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2: 3609; GFX940: ; %bb.0: 3610; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3611; GFX940-NEXT: ;;#ASMSTART 3612; GFX940-NEXT: ; def v[2:3] 3613; GFX940-NEXT: ;;#ASMEND 3614; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3615; GFX940-NEXT: v_mov_b32_e32 v4, 0 3616; GFX940-NEXT: ;;#ASMSTART 3617; GFX940-NEXT: ; def v[0:1] 3618; GFX940-NEXT: ;;#ASMEND 3619; GFX940-NEXT: s_nop 0 3620; GFX940-NEXT: 
v_perm_b32 v3, v1, v1, s2 3621; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 3622; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 3623; GFX940-NEXT: s_waitcnt vmcnt(0) 3624; GFX940-NEXT: s_setpc_b64 s[30:31] 3625 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3626 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3627 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3628 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3629 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 2, i32 2, i32 2> 3630 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3631 ret void 3632} 3633 3634define void @v_shuffle_v4bf16_v3bf16__5_2_2_2(ptr addrspace(1) inreg %ptr) { 3635; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2: 3636; GFX900: ; %bb.0: 3637; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3638; GFX900-NEXT: ;;#ASMSTART 3639; GFX900-NEXT: ; def v[0:1] 3640; GFX900-NEXT: ;;#ASMEND 3641; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3642; GFX900-NEXT: v_mov_b32_e32 v4, 0 3643; GFX900-NEXT: ;;#ASMSTART 3644; GFX900-NEXT: ; def v[2:3] 3645; GFX900-NEXT: ;;#ASMEND 3646; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 3647; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3648; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3649; GFX900-NEXT: s_waitcnt vmcnt(0) 3650; GFX900-NEXT: s_setpc_b64 s[30:31] 3651; 3652; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2: 3653; GFX90A: ; %bb.0: 3654; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3655; GFX90A-NEXT: ;;#ASMSTART 3656; GFX90A-NEXT: ; def v[0:1] 3657; GFX90A-NEXT: ;;#ASMEND 3658; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3659; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3660; GFX90A-NEXT: ;;#ASMSTART 3661; GFX90A-NEXT: ; def v[2:3] 3662; GFX90A-NEXT: ;;#ASMEND 3663; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 3664; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3665; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] 3666; GFX90A-NEXT: s_waitcnt vmcnt(0) 3667; GFX90A-NEXT: s_setpc_b64 s[30:31] 3668; 3669; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2: 3670; GFX940: ; %bb.0: 3671; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3672; GFX940-NEXT: ;;#ASMSTART 3673; GFX940-NEXT: ; def v[0:1] 3674; GFX940-NEXT: ;;#ASMEND 3675; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3676; GFX940-NEXT: v_mov_b32_e32 v4, 0 3677; GFX940-NEXT: ;;#ASMSTART 3678; GFX940-NEXT: ; def v[2:3] 3679; GFX940-NEXT: ;;#ASMEND 3680; GFX940-NEXT: s_nop 0 3681; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 3682; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3683; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3684; GFX940-NEXT: s_waitcnt vmcnt(0) 3685; GFX940-NEXT: s_setpc_b64 s[30:31] 3686 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3687 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3688 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3689 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3690 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 2, i32 2> 3691 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3692 ret void 3693} 3694 3695define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) { 3696; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2: 3697; GFX900: ; %bb.0: 3698; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3699; GFX900-NEXT: ;;#ASMSTART 3700; GFX900-NEXT: ; def v[0:1] 3701; GFX900-NEXT: ;;#ASMEND 3702; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3703; GFX900-NEXT: v_mov_b32_e32 v4, 0 3704; GFX900-NEXT: ;;#ASMSTART 3705; GFX900-NEXT: ; def v[2:3] 3706; GFX900-NEXT: ;;#ASMEND 3707; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3708; GFX900-NEXT: v_mov_b32_e32 v0, v3 3709; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3710; GFX900-NEXT: s_waitcnt vmcnt(0) 3711; GFX900-NEXT: s_setpc_b64 s[30:31] 3712; 
3713; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2: 3714; GFX90A: ; %bb.0: 3715; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3716; GFX90A-NEXT: ;;#ASMSTART 3717; GFX90A-NEXT: ; def v[0:1] 3718; GFX90A-NEXT: ;;#ASMEND 3719; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3720; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3721; GFX90A-NEXT: ;;#ASMSTART 3722; GFX90A-NEXT: ; def v[2:3] 3723; GFX90A-NEXT: ;;#ASMEND 3724; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3725; GFX90A-NEXT: v_mov_b32_e32 v0, v3 3726; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3727; GFX90A-NEXT: s_waitcnt vmcnt(0) 3728; GFX90A-NEXT: s_setpc_b64 s[30:31] 3729; 3730; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2: 3731; GFX940: ; %bb.0: 3732; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3733; GFX940-NEXT: ;;#ASMSTART 3734; GFX940-NEXT: ; def v[0:1] 3735; GFX940-NEXT: ;;#ASMEND 3736; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3737; GFX940-NEXT: v_mov_b32_e32 v4, 0 3738; GFX940-NEXT: ;;#ASMSTART 3739; GFX940-NEXT: ; def v[2:3] 3740; GFX940-NEXT: ;;#ASMEND 3741; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3742; GFX940-NEXT: v_mov_b32_e32 v0, v3 3743; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3744; GFX940-NEXT: s_waitcnt vmcnt(0) 3745; GFX940-NEXT: s_setpc_b64 s[30:31] 3746 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3747 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3748 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3749 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3750 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 2, i32 2> 3751 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3752 ret void 3753} 3754 3755define void @v_shuffle_v4bf16_v3bf16__5_0_2_2(ptr addrspace(1) inreg %ptr) { 3756; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2: 3757; GFX900: ; %bb.0: 3758; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 3759; GFX900-NEXT: ;;#ASMSTART 3760; GFX900-NEXT: ; def v[0:1] 3761; GFX900-NEXT: ;;#ASMEND 3762; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3763; GFX900-NEXT: v_mov_b32_e32 v4, 0 3764; GFX900-NEXT: ;;#ASMSTART 3765; GFX900-NEXT: ; def v[2:3] 3766; GFX900-NEXT: ;;#ASMEND 3767; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 3768; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3769; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3770; GFX900-NEXT: s_waitcnt vmcnt(0) 3771; GFX900-NEXT: s_setpc_b64 s[30:31] 3772; 3773; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2: 3774; GFX90A: ; %bb.0: 3775; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3776; GFX90A-NEXT: ;;#ASMSTART 3777; GFX90A-NEXT: ; def v[0:1] 3778; GFX90A-NEXT: ;;#ASMEND 3779; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3780; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3781; GFX90A-NEXT: ;;#ASMSTART 3782; GFX90A-NEXT: ; def v[2:3] 3783; GFX90A-NEXT: ;;#ASMEND 3784; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 3785; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3786; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3787; GFX90A-NEXT: s_waitcnt vmcnt(0) 3788; GFX90A-NEXT: s_setpc_b64 s[30:31] 3789; 3790; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2: 3791; GFX940: ; %bb.0: 3792; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3793; GFX940-NEXT: ;;#ASMSTART 3794; GFX940-NEXT: ; def v[0:1] 3795; GFX940-NEXT: ;;#ASMEND 3796; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3797; GFX940-NEXT: v_mov_b32_e32 v4, 0 3798; GFX940-NEXT: ;;#ASMSTART 3799; GFX940-NEXT: ; def v[2:3] 3800; GFX940-NEXT: ;;#ASMEND 3801; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3802; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 3803; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3804; GFX940-NEXT: s_waitcnt vmcnt(0) 3805; GFX940-NEXT: s_setpc_b64 s[30:31] 3806 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3807 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3808 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 
3809 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3810 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 2, i32 2> 3811 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3812 ret void 3813} 3814 3815define void @v_shuffle_v4bf16_v3bf16__5_1_2_2(ptr addrspace(1) inreg %ptr) { 3816; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2: 3817; GFX900: ; %bb.0: 3818; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3819; GFX900-NEXT: ;;#ASMSTART 3820; GFX900-NEXT: ; def v[0:1] 3821; GFX900-NEXT: ;;#ASMEND 3822; GFX900-NEXT: s_mov_b32 s4, 0xffff 3823; GFX900-NEXT: ;;#ASMSTART 3824; GFX900-NEXT: ; def v[2:3] 3825; GFX900-NEXT: ;;#ASMEND 3826; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 3827; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3828; GFX900-NEXT: v_mov_b32_e32 v4, 0 3829; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3830; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3831; GFX900-NEXT: s_waitcnt vmcnt(0) 3832; GFX900-NEXT: s_setpc_b64 s[30:31] 3833; 3834; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2: 3835; GFX90A: ; %bb.0: 3836; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3837; GFX90A-NEXT: ;;#ASMSTART 3838; GFX90A-NEXT: ; def v[0:1] 3839; GFX90A-NEXT: ;;#ASMEND 3840; GFX90A-NEXT: s_mov_b32 s4, 0xffff 3841; GFX90A-NEXT: ;;#ASMSTART 3842; GFX90A-NEXT: ; def v[2:3] 3843; GFX90A-NEXT: ;;#ASMEND 3844; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 3845; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3846; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3847; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3848; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3849; GFX90A-NEXT: s_waitcnt vmcnt(0) 3850; GFX90A-NEXT: s_setpc_b64 s[30:31] 3851; 3852; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2: 3853; GFX940: ; %bb.0: 3854; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3855; GFX940-NEXT: ;;#ASMSTART 3856; GFX940-NEXT: ; def v[0:1] 3857; GFX940-NEXT: ;;#ASMEND 3858; GFX940-NEXT: s_mov_b32 
s2, 0xffff 3859; GFX940-NEXT: ;;#ASMSTART 3860; GFX940-NEXT: ; def v[2:3] 3861; GFX940-NEXT: ;;#ASMEND 3862; GFX940-NEXT: v_mov_b32_e32 v4, 0 3863; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 3864; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3865; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3866; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3867; GFX940-NEXT: s_waitcnt vmcnt(0) 3868; GFX940-NEXT: s_setpc_b64 s[30:31] 3869 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3870 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3871 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3872 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3873 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 2, i32 2> 3874 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3875 ret void 3876} 3877 3878define void @v_shuffle_v4bf16_v3bf16__5_3_2_2(ptr addrspace(1) inreg %ptr) { 3879; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2: 3880; GFX900: ; %bb.0: 3881; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3882; GFX900-NEXT: ;;#ASMSTART 3883; GFX900-NEXT: ; def v[0:1] 3884; GFX900-NEXT: ;;#ASMEND 3885; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3886; GFX900-NEXT: v_mov_b32_e32 v4, 0 3887; GFX900-NEXT: ;;#ASMSTART 3888; GFX900-NEXT: ; def v[2:3] 3889; GFX900-NEXT: ;;#ASMEND 3890; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 3891; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3892; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3893; GFX900-NEXT: s_waitcnt vmcnt(0) 3894; GFX900-NEXT: s_setpc_b64 s[30:31] 3895; 3896; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2: 3897; GFX90A: ; %bb.0: 3898; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3899; GFX90A-NEXT: ;;#ASMSTART 3900; GFX90A-NEXT: ; def v[0:1] 3901; GFX90A-NEXT: ;;#ASMEND 3902; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3903; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3904; GFX90A-NEXT: ;;#ASMSTART 
3905; GFX90A-NEXT: ; def v[2:3] 3906; GFX90A-NEXT: ;;#ASMEND 3907; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 3908; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3909; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3910; GFX90A-NEXT: s_waitcnt vmcnt(0) 3911; GFX90A-NEXT: s_setpc_b64 s[30:31] 3912; 3913; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2: 3914; GFX940: ; %bb.0: 3915; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3916; GFX940-NEXT: ;;#ASMSTART 3917; GFX940-NEXT: ; def v[0:1] 3918; GFX940-NEXT: ;;#ASMEND 3919; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3920; GFX940-NEXT: v_mov_b32_e32 v4, 0 3921; GFX940-NEXT: ;;#ASMSTART 3922; GFX940-NEXT: ; def v[2:3] 3923; GFX940-NEXT: ;;#ASMEND 3924; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3925; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 3926; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3927; GFX940-NEXT: s_waitcnt vmcnt(0) 3928; GFX940-NEXT: s_setpc_b64 s[30:31] 3929 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3930 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3931 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3932 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3933 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 2, i32 2> 3934 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3935 ret void 3936} 3937 3938define void @v_shuffle_v4bf16_v3bf16__5_4_2_2(ptr addrspace(1) inreg %ptr) { 3939; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2: 3940; GFX900: ; %bb.0: 3941; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3942; GFX900-NEXT: ;;#ASMSTART 3943; GFX900-NEXT: ; def v[0:1] 3944; GFX900-NEXT: ;;#ASMEND 3945; GFX900-NEXT: s_mov_b32 s4, 0xffff 3946; GFX900-NEXT: ;;#ASMSTART 3947; GFX900-NEXT: ; def v[2:3] 3948; GFX900-NEXT: ;;#ASMEND 3949; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 3950; GFX900-NEXT: s_mov_b32 s4, 0x5040100 3951; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 3952; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 3953; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3954; GFX900-NEXT: s_waitcnt vmcnt(0) 3955; GFX900-NEXT: s_setpc_b64 s[30:31] 3956; 3957; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2: 3958; GFX90A: ; %bb.0: 3959; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3960; GFX90A-NEXT: ;;#ASMSTART 3961; GFX90A-NEXT: ; def v[0:1] 3962; GFX90A-NEXT: ;;#ASMEND 3963; GFX90A-NEXT: s_mov_b32 s4, 0xffff 3964; GFX90A-NEXT: ;;#ASMSTART 3965; GFX90A-NEXT: ; def v[2:3] 3966; GFX90A-NEXT: ;;#ASMEND 3967; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 3968; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 3969; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3970; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 3971; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 3972; GFX90A-NEXT: s_waitcnt vmcnt(0) 3973; GFX90A-NEXT: s_setpc_b64 s[30:31] 3974; 3975; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2: 3976; GFX940: ; %bb.0: 3977; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3978; GFX940-NEXT: ;;#ASMSTART 3979; GFX940-NEXT: ; def v[0:1] 3980; GFX940-NEXT: ;;#ASMEND 3981; GFX940-NEXT: s_mov_b32 s2, 0xffff 3982; GFX940-NEXT: ;;#ASMSTART 3983; GFX940-NEXT: ; def v[2:3] 3984; GFX940-NEXT: ;;#ASMEND 3985; GFX940-NEXT: v_mov_b32_e32 v4, 0 3986; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 3987; GFX940-NEXT: s_mov_b32 s2, 0x5040100 3988; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 3989; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 3990; GFX940-NEXT: s_waitcnt vmcnt(0) 3991; GFX940-NEXT: s_setpc_b64 s[30:31] 3992 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 3993 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 3994 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3995 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 3996 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 2, i32 2> 3997 
store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 3998 ret void 3999} 4000 4001define void @v_shuffle_v4bf16_v3bf16__5_5_2_2(ptr addrspace(1) inreg %ptr) { 4002; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2: 4003; GFX900: ; %bb.0: 4004; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4005; GFX900-NEXT: ;;#ASMSTART 4006; GFX900-NEXT: ; def v[0:1] 4007; GFX900-NEXT: ;;#ASMEND 4008; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4009; GFX900-NEXT: v_mov_b32_e32 v4, 0 4010; GFX900-NEXT: ;;#ASMSTART 4011; GFX900-NEXT: ; def v[2:3] 4012; GFX900-NEXT: ;;#ASMEND 4013; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 4014; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 4015; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4016; GFX900-NEXT: s_waitcnt vmcnt(0) 4017; GFX900-NEXT: s_setpc_b64 s[30:31] 4018; 4019; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2: 4020; GFX90A: ; %bb.0: 4021; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4022; GFX90A-NEXT: ;;#ASMSTART 4023; GFX90A-NEXT: ; def v[0:1] 4024; GFX90A-NEXT: ;;#ASMEND 4025; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4026; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4027; GFX90A-NEXT: ;;#ASMSTART 4028; GFX90A-NEXT: ; def v[2:3] 4029; GFX90A-NEXT: ;;#ASMEND 4030; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 4031; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 4032; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4033; GFX90A-NEXT: s_waitcnt vmcnt(0) 4034; GFX90A-NEXT: s_setpc_b64 s[30:31] 4035; 4036; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2: 4037; GFX940: ; %bb.0: 4038; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4039; GFX940-NEXT: ;;#ASMSTART 4040; GFX940-NEXT: ; def v[0:1] 4041; GFX940-NEXT: ;;#ASMEND 4042; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4043; GFX940-NEXT: v_mov_b32_e32 v4, 0 4044; GFX940-NEXT: ;;#ASMSTART 4045; GFX940-NEXT: ; def v[2:3] 4046; GFX940-NEXT: ;;#ASMEND 4047; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 4048; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 4049; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 
4050; GFX940-NEXT: s_waitcnt vmcnt(0) 4051; GFX940-NEXT: s_setpc_b64 s[30:31] 4052 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4053 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4054 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4055 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4056 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 2> 4057 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4058 ret void 4059} 4060 4061define void @v_shuffle_v4bf16_v3bf16__5_5_u_2(ptr addrspace(1) inreg %ptr) { 4062; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2: 4063; GFX900: ; %bb.0: 4064; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4065; GFX900-NEXT: ;;#ASMSTART 4066; GFX900-NEXT: ; def v[0:1] 4067; GFX900-NEXT: ;;#ASMEND 4068; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4069; GFX900-NEXT: v_mov_b32_e32 v4, 0 4070; GFX900-NEXT: ;;#ASMSTART 4071; GFX900-NEXT: ; def v[2:3] 4072; GFX900-NEXT: ;;#ASMEND 4073; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 4074; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4075; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4076; GFX900-NEXT: s_waitcnt vmcnt(0) 4077; GFX900-NEXT: s_setpc_b64 s[30:31] 4078; 4079; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2: 4080; GFX90A: ; %bb.0: 4081; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4082; GFX90A-NEXT: ;;#ASMSTART 4083; GFX90A-NEXT: ; def v[0:1] 4084; GFX90A-NEXT: ;;#ASMEND 4085; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4086; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4087; GFX90A-NEXT: ;;#ASMSTART 4088; GFX90A-NEXT: ; def v[2:3] 4089; GFX90A-NEXT: ;;#ASMEND 4090; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 4091; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4092; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4093; GFX90A-NEXT: s_waitcnt vmcnt(0) 4094; GFX90A-NEXT: s_setpc_b64 s[30:31] 4095; 4096; GFX940-LABEL: 
v_shuffle_v4bf16_v3bf16__5_5_u_2: 4097; GFX940: ; %bb.0: 4098; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4099; GFX940-NEXT: ;;#ASMSTART 4100; GFX940-NEXT: ; def v[0:1] 4101; GFX940-NEXT: ;;#ASMEND 4102; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4103; GFX940-NEXT: v_mov_b32_e32 v4, 0 4104; GFX940-NEXT: ;;#ASMSTART 4105; GFX940-NEXT: ; def v[2:3] 4106; GFX940-NEXT: ;;#ASMEND 4107; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4108; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 4109; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 4110; GFX940-NEXT: s_waitcnt vmcnt(0) 4111; GFX940-NEXT: s_setpc_b64 s[30:31] 4112 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4113 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4114 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4115 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4116 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 2> 4117 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4118 ret void 4119} 4120 4121define void @v_shuffle_v4bf16_v3bf16__5_5_0_2(ptr addrspace(1) inreg %ptr) { 4122; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2: 4123; GFX900: ; %bb.0: 4124; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4125; GFX900-NEXT: ;;#ASMSTART 4126; GFX900-NEXT: ; def v[0:1] 4127; GFX900-NEXT: ;;#ASMEND 4128; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4129; GFX900-NEXT: v_mov_b32_e32 v4, 0 4130; GFX900-NEXT: ;;#ASMSTART 4131; GFX900-NEXT: ; def v[2:3] 4132; GFX900-NEXT: ;;#ASMEND 4133; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 4134; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 4135; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4136; GFX900-NEXT: s_waitcnt vmcnt(0) 4137; GFX900-NEXT: s_setpc_b64 s[30:31] 4138; 4139; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2: 4140; GFX90A: ; %bb.0: 4141; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
4142; GFX90A-NEXT: ;;#ASMSTART 4143; GFX90A-NEXT: ; def v[0:1] 4144; GFX90A-NEXT: ;;#ASMEND 4145; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4146; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4147; GFX90A-NEXT: ;;#ASMSTART 4148; GFX90A-NEXT: ; def v[2:3] 4149; GFX90A-NEXT: ;;#ASMEND 4150; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 4151; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 4152; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4153; GFX90A-NEXT: s_waitcnt vmcnt(0) 4154; GFX90A-NEXT: s_setpc_b64 s[30:31] 4155; 4156; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2: 4157; GFX940: ; %bb.0: 4158; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4159; GFX940-NEXT: ;;#ASMSTART 4160; GFX940-NEXT: ; def v[0:1] 4161; GFX940-NEXT: ;;#ASMEND 4162; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4163; GFX940-NEXT: v_mov_b32_e32 v4, 0 4164; GFX940-NEXT: ;;#ASMSTART 4165; GFX940-NEXT: ; def v[2:3] 4166; GFX940-NEXT: ;;#ASMEND 4167; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 4168; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 4169; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 4170; GFX940-NEXT: s_waitcnt vmcnt(0) 4171; GFX940-NEXT: s_setpc_b64 s[30:31] 4172 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4173 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4174 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4175 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4176 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 2> 4177 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4178 ret void 4179} 4180 4181define void @v_shuffle_v4bf16_v3bf16__5_5_1_2(ptr addrspace(1) inreg %ptr) { 4182; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2: 4183; GFX900: ; %bb.0: 4184; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4185; GFX900-NEXT: ;;#ASMSTART 4186; GFX900-NEXT: ; def v[0:1] 4187; GFX900-NEXT: ;;#ASMEND 4188; GFX900-NEXT: s_mov_b32 s4, 
0x5040100 4189; GFX900-NEXT: v_mov_b32_e32 v4, 0 4190; GFX900-NEXT: ;;#ASMSTART 4191; GFX900-NEXT: ; def v[2:3] 4192; GFX900-NEXT: ;;#ASMEND 4193; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 4194; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 4195; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4196; GFX900-NEXT: s_waitcnt vmcnt(0) 4197; GFX900-NEXT: s_setpc_b64 s[30:31] 4198; 4199; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2: 4200; GFX90A: ; %bb.0: 4201; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4202; GFX90A-NEXT: ;;#ASMSTART 4203; GFX90A-NEXT: ; def v[0:1] 4204; GFX90A-NEXT: ;;#ASMEND 4205; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4206; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4207; GFX90A-NEXT: ;;#ASMSTART 4208; GFX90A-NEXT: ; def v[2:3] 4209; GFX90A-NEXT: ;;#ASMEND 4210; GFX90A-NEXT: v_alignbit_b32 v1, v1, v0, 16 4211; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 4212; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4213; GFX90A-NEXT: s_waitcnt vmcnt(0) 4214; GFX90A-NEXT: s_setpc_b64 s[30:31] 4215; 4216; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2: 4217; GFX940: ; %bb.0: 4218; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4219; GFX940-NEXT: ;;#ASMSTART 4220; GFX940-NEXT: ; def v[0:1] 4221; GFX940-NEXT: ;;#ASMEND 4222; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4223; GFX940-NEXT: v_mov_b32_e32 v4, 0 4224; GFX940-NEXT: ;;#ASMSTART 4225; GFX940-NEXT: ; def v[2:3] 4226; GFX940-NEXT: ;;#ASMEND 4227; GFX940-NEXT: v_alignbit_b32 v1, v1, v0, 16 4228; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 4229; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 4230; GFX940-NEXT: s_waitcnt vmcnt(0) 4231; GFX940-NEXT: s_setpc_b64 s[30:31] 4232 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4233 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4234 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4235 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4236 %shuf = 
shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 2> 4237 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4238 ret void 4239} 4240 4241define void @v_shuffle_v4bf16_v3bf16__5_5_3_2(ptr addrspace(1) inreg %ptr) { 4242; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2: 4243; GFX900: ; %bb.0: 4244; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4245; GFX900-NEXT: ;;#ASMSTART 4246; GFX900-NEXT: ; def v[0:1] 4247; GFX900-NEXT: ;;#ASMEND 4248; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4249; GFX900-NEXT: v_mov_b32_e32 v4, 0 4250; GFX900-NEXT: ;;#ASMSTART 4251; GFX900-NEXT: ; def v[2:3] 4252; GFX900-NEXT: ;;#ASMEND 4253; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 4254; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 4255; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4256; GFX900-NEXT: s_waitcnt vmcnt(0) 4257; GFX900-NEXT: s_setpc_b64 s[30:31] 4258; 4259; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2: 4260; GFX90A: ; %bb.0: 4261; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4262; GFX90A-NEXT: ;;#ASMSTART 4263; GFX90A-NEXT: ; def v[0:1] 4264; GFX90A-NEXT: ;;#ASMEND 4265; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4266; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4267; GFX90A-NEXT: ;;#ASMSTART 4268; GFX90A-NEXT: ; def v[2:3] 4269; GFX90A-NEXT: ;;#ASMEND 4270; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 4271; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 4272; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4273; GFX90A-NEXT: s_waitcnt vmcnt(0) 4274; GFX90A-NEXT: s_setpc_b64 s[30:31] 4275; 4276; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2: 4277; GFX940: ; %bb.0: 4278; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4279; GFX940-NEXT: ;;#ASMSTART 4280; GFX940-NEXT: ; def v[0:1] 4281; GFX940-NEXT: ;;#ASMEND 4282; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4283; GFX940-NEXT: v_mov_b32_e32 v4, 0 4284; GFX940-NEXT: ;;#ASMSTART 4285; GFX940-NEXT: ; def v[2:3] 4286; GFX940-NEXT: ;;#ASMEND 4287; GFX940-NEXT: s_nop 0 4288; GFX940-NEXT: 
v_perm_b32 v1, v1, v2, s2 4289; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 4290; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 4291; GFX940-NEXT: s_waitcnt vmcnt(0) 4292; GFX940-NEXT: s_setpc_b64 s[30:31] 4293 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4294 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4295 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4296 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4297 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 2> 4298 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4299 ret void 4300} 4301 4302define void @v_shuffle_v4bf16_v3bf16__5_5_4_2(ptr addrspace(1) inreg %ptr) { 4303; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2: 4304; GFX900: ; %bb.0: 4305; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4306; GFX900-NEXT: ;;#ASMSTART 4307; GFX900-NEXT: ; def v[0:1] 4308; GFX900-NEXT: ;;#ASMEND 4309; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4310; GFX900-NEXT: v_mov_b32_e32 v4, 0 4311; GFX900-NEXT: ;;#ASMSTART 4312; GFX900-NEXT: ; def v[2:3] 4313; GFX900-NEXT: ;;#ASMEND 4314; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 4315; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 4316; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4317; GFX900-NEXT: s_waitcnt vmcnt(0) 4318; GFX900-NEXT: s_setpc_b64 s[30:31] 4319; 4320; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2: 4321; GFX90A: ; %bb.0: 4322; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4323; GFX90A-NEXT: ;;#ASMSTART 4324; GFX90A-NEXT: ; def v[0:1] 4325; GFX90A-NEXT: ;;#ASMEND 4326; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4327; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4328; GFX90A-NEXT: ;;#ASMSTART 4329; GFX90A-NEXT: ; def v[2:3] 4330; GFX90A-NEXT: ;;#ASMEND 4331; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 4332; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 4333; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] 4334; GFX90A-NEXT: s_waitcnt vmcnt(0) 4335; GFX90A-NEXT: s_setpc_b64 s[30:31] 4336; 4337; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2: 4338; GFX940: ; %bb.0: 4339; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4340; GFX940-NEXT: ;;#ASMSTART 4341; GFX940-NEXT: ; def v[0:1] 4342; GFX940-NEXT: ;;#ASMEND 4343; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4344; GFX940-NEXT: v_mov_b32_e32 v4, 0 4345; GFX940-NEXT: ;;#ASMSTART 4346; GFX940-NEXT: ; def v[2:3] 4347; GFX940-NEXT: ;;#ASMEND 4348; GFX940-NEXT: s_nop 0 4349; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 4350; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 4351; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 4352; GFX940-NEXT: s_waitcnt vmcnt(0) 4353; GFX940-NEXT: s_setpc_b64 s[30:31] 4354 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4355 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4356 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4357 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4358 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 2> 4359 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4360 ret void 4361} 4362 4363define void @v_shuffle_v4bf16_v3bf16__u_3_3_3(ptr addrspace(1) inreg %ptr) { 4364; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__u_3_3_3: 4365; GFX9: ; %bb.0: 4366; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4367; GFX9-NEXT: s_setpc_b64 s[30:31] 4368 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4369 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4370 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3> 4371 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4372 ret void 4373} 4374 4375define void @v_shuffle_v4bf16_v3bf16__0_3_3_3(ptr addrspace(1) inreg %ptr) { 4376; GFX900-LABEL: 
v_shuffle_v4bf16_v3bf16__0_3_3_3: 4377; GFX900: ; %bb.0: 4378; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4379; GFX900-NEXT: v_mov_b32_e32 v2, 0 4380; GFX900-NEXT: ;;#ASMSTART 4381; GFX900-NEXT: ; def v[0:1] 4382; GFX900-NEXT: ;;#ASMEND 4383; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4384; GFX900-NEXT: s_waitcnt vmcnt(0) 4385; GFX900-NEXT: s_setpc_b64 s[30:31] 4386; 4387; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3: 4388; GFX90A: ; %bb.0: 4389; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4390; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4391; GFX90A-NEXT: ;;#ASMSTART 4392; GFX90A-NEXT: ; def v[0:1] 4393; GFX90A-NEXT: ;;#ASMEND 4394; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4395; GFX90A-NEXT: s_waitcnt vmcnt(0) 4396; GFX90A-NEXT: s_setpc_b64 s[30:31] 4397; 4398; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3: 4399; GFX940: ; %bb.0: 4400; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4401; GFX940-NEXT: v_mov_b32_e32 v2, 0 4402; GFX940-NEXT: ;;#ASMSTART 4403; GFX940-NEXT: ; def v[0:1] 4404; GFX940-NEXT: ;;#ASMEND 4405; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 4406; GFX940-NEXT: s_waitcnt vmcnt(0) 4407; GFX940-NEXT: s_setpc_b64 s[30:31] 4408 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4409 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4410 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3> 4411 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4412 ret void 4413} 4414 4415define void @v_shuffle_v4bf16_v3bf16__1_3_3_3(ptr addrspace(1) inreg %ptr) { 4416; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3: 4417; GFX900: ; %bb.0: 4418; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4419; GFX900-NEXT: ;;#ASMSTART 4420; GFX900-NEXT: ; def v[0:1] 4421; GFX900-NEXT: ;;#ASMEND 4422; GFX900-NEXT: v_mov_b32_e32 v2, 0 4423; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 4424; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] 4425; GFX900-NEXT: s_waitcnt vmcnt(0) 4426; GFX900-NEXT: s_setpc_b64 s[30:31] 4427; 4428; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3: 4429; GFX90A: ; %bb.0: 4430; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4431; GFX90A-NEXT: ;;#ASMSTART 4432; GFX90A-NEXT: ; def v[0:1] 4433; GFX90A-NEXT: ;;#ASMEND 4434; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4435; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 4436; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4437; GFX90A-NEXT: s_waitcnt vmcnt(0) 4438; GFX90A-NEXT: s_setpc_b64 s[30:31] 4439; 4440; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3: 4441; GFX940: ; %bb.0: 4442; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4443; GFX940-NEXT: ;;#ASMSTART 4444; GFX940-NEXT: ; def v[0:1] 4445; GFX940-NEXT: ;;#ASMEND 4446; GFX940-NEXT: v_mov_b32_e32 v2, 0 4447; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 4448; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 4449; GFX940-NEXT: s_waitcnt vmcnt(0) 4450; GFX940-NEXT: s_setpc_b64 s[30:31] 4451 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4452 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4453 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3> 4454 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4455 ret void 4456} 4457 4458define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { 4459; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: 4460; GFX900: ; %bb.0: 4461; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4462; GFX900-NEXT: ;;#ASMSTART 4463; GFX900-NEXT: ; def v[0:1] 4464; GFX900-NEXT: ;;#ASMEND 4465; GFX900-NEXT: v_mov_b32_e32 v2, 0 4466; GFX900-NEXT: v_mov_b32_e32 v0, v1 4467; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4468; GFX900-NEXT: s_waitcnt vmcnt(0) 4469; GFX900-NEXT: s_setpc_b64 s[30:31] 4470; 4471; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: 4472; 
GFX90A: ; %bb.0: 4473; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4474; GFX90A-NEXT: ;;#ASMSTART 4475; GFX90A-NEXT: ; def v[0:1] 4476; GFX90A-NEXT: ;;#ASMEND 4477; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4478; GFX90A-NEXT: v_mov_b32_e32 v0, v1 4479; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4480; GFX90A-NEXT: s_waitcnt vmcnt(0) 4481; GFX90A-NEXT: s_setpc_b64 s[30:31] 4482; 4483; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: 4484; GFX940: ; %bb.0: 4485; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4486; GFX940-NEXT: ;;#ASMSTART 4487; GFX940-NEXT: ; def v[0:1] 4488; GFX940-NEXT: ;;#ASMEND 4489; GFX940-NEXT: v_mov_b32_e32 v2, 0 4490; GFX940-NEXT: v_mov_b32_e32 v0, v1 4491; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 4492; GFX940-NEXT: s_waitcnt vmcnt(0) 4493; GFX940-NEXT: s_setpc_b64 s[30:31] 4494 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4495 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4496 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 4497 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4498 ret void 4499} 4500 4501define void @v_shuffle_v4bf16_v3bf16__3_3_3_3(ptr addrspace(1) inreg %ptr) { 4502; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__3_3_3_3: 4503; GFX9: ; %bb.0: 4504; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4505; GFX9-NEXT: s_setpc_b64 s[30:31] 4506 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4507 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4508 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 4509 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4510 ret void 4511} 4512 4513define void @v_shuffle_v4bf16_v3bf16__4_3_3_3(ptr addrspace(1) inreg %ptr) { 4514; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3: 4515; GFX900: ; %bb.0: 4516; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 4517; GFX900-NEXT: ;;#ASMSTART 4518; GFX900-NEXT: ; def v[0:1] 4519; GFX900-NEXT: ;;#ASMEND 4520; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4521; GFX900-NEXT: v_mov_b32_e32 v2, 0 4522; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 4523; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 4524; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4525; GFX900-NEXT: s_waitcnt vmcnt(0) 4526; GFX900-NEXT: s_setpc_b64 s[30:31] 4527; 4528; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3: 4529; GFX90A: ; %bb.0: 4530; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4531; GFX90A-NEXT: ;;#ASMSTART 4532; GFX90A-NEXT: ; def v[0:1] 4533; GFX90A-NEXT: ;;#ASMEND 4534; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4535; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4536; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 4537; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 4538; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 4539; GFX90A-NEXT: s_waitcnt vmcnt(0) 4540; GFX90A-NEXT: s_setpc_b64 s[30:31] 4541; 4542; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3: 4543; GFX940: ; %bb.0: 4544; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4545; GFX940-NEXT: ;;#ASMSTART 4546; GFX940-NEXT: ; def v[0:1] 4547; GFX940-NEXT: ;;#ASMEND 4548; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4549; GFX940-NEXT: v_mov_b32_e32 v2, 0 4550; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 4551; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 4552; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 4553; GFX940-NEXT: s_waitcnt vmcnt(0) 4554; GFX940-NEXT: s_setpc_b64 s[30:31] 4555 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4556 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4557 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4558 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4559 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 3, i32 3, i32 3> 4560 store <4 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 8 4561 ret void 4562} 4563 4564define void @v_shuffle_v4bf16_v3bf16__5_3_3_3(ptr addrspace(1) inreg %ptr) { 4565; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3: 4566; GFX900: ; %bb.0: 4567; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4568; GFX900-NEXT: ;;#ASMSTART 4569; GFX900-NEXT: ; def v[0:1] 4570; GFX900-NEXT: ;;#ASMEND 4571; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4572; GFX900-NEXT: v_mov_b32_e32 v3, 0 4573; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 4574; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 4575; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 4576; GFX900-NEXT: s_waitcnt vmcnt(0) 4577; GFX900-NEXT: s_setpc_b64 s[30:31] 4578; 4579; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3: 4580; GFX90A: ; %bb.0: 4581; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4582; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4583; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4584; GFX90A-NEXT: ;;#ASMSTART 4585; GFX90A-NEXT: ; def v[0:1] 4586; GFX90A-NEXT: ;;#ASMEND 4587; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 4588; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 4589; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 4590; GFX90A-NEXT: s_waitcnt vmcnt(0) 4591; GFX90A-NEXT: s_setpc_b64 s[30:31] 4592; 4593; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3: 4594; GFX940: ; %bb.0: 4595; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4596; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4597; GFX940-NEXT: v_mov_b32_e32 v4, 0 4598; GFX940-NEXT: ;;#ASMSTART 4599; GFX940-NEXT: ; def v[0:1] 4600; GFX940-NEXT: ;;#ASMEND 4601; GFX940-NEXT: s_nop 0 4602; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 4603; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 4604; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 4605; GFX940-NEXT: s_waitcnt vmcnt(0) 4606; GFX940-NEXT: s_setpc_b64 s[30:31] 4607 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4608 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4609 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> 
<i32 0, i32 1, i32 2> 4610 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4611 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 3, i32 3> 4612 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4613 ret void 4614} 4615 4616define void @v_shuffle_v4bf16_v3bf16__5_u_3_3(ptr addrspace(1) inreg %ptr) { 4617; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3: 4618; GFX900: ; %bb.0: 4619; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4620; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4621; GFX900-NEXT: v_mov_b32_e32 v3, 0 4622; GFX900-NEXT: ;;#ASMSTART 4623; GFX900-NEXT: ; def v[0:1] 4624; GFX900-NEXT: ;;#ASMEND 4625; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 4626; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 4627; GFX900-NEXT: s_waitcnt vmcnt(0) 4628; GFX900-NEXT: s_setpc_b64 s[30:31] 4629; 4630; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3: 4631; GFX90A: ; %bb.0: 4632; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4633; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4634; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4635; GFX90A-NEXT: ;;#ASMSTART 4636; GFX90A-NEXT: ; def v[0:1] 4637; GFX90A-NEXT: ;;#ASMEND 4638; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 4639; GFX90A-NEXT: v_mov_b32_e32 v2, v1 4640; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 4641; GFX90A-NEXT: s_waitcnt vmcnt(0) 4642; GFX90A-NEXT: s_setpc_b64 s[30:31] 4643; 4644; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3: 4645; GFX940: ; %bb.0: 4646; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4647; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4648; GFX940-NEXT: v_mov_b32_e32 v4, 0 4649; GFX940-NEXT: ;;#ASMSTART 4650; GFX940-NEXT: ; def v[0:1] 4651; GFX940-NEXT: ;;#ASMEND 4652; GFX940-NEXT: s_nop 0 4653; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 4654; GFX940-NEXT: v_mov_b32_e32 v2, v1 4655; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 4656; GFX940-NEXT: s_waitcnt vmcnt(0) 4657; GFX940-NEXT: 
s_setpc_b64 s[30:31] 4658 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4659 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4660 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4661 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4662 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 3, i32 3> 4663 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4664 ret void 4665} 4666 4667define void @v_shuffle_v4bf16_v3bf16__5_0_3_3(ptr addrspace(1) inreg %ptr) { 4668; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3: 4669; GFX900: ; %bb.0: 4670; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4671; GFX900-NEXT: ;;#ASMSTART 4672; GFX900-NEXT: ; def v[0:1] 4673; GFX900-NEXT: ;;#ASMEND 4674; GFX900-NEXT: ;;#ASMSTART 4675; GFX900-NEXT: ; def v[1:2] 4676; GFX900-NEXT: ;;#ASMEND 4677; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4678; GFX900-NEXT: v_mov_b32_e32 v3, 0 4679; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 4680; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 4681; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 4682; GFX900-NEXT: s_waitcnt vmcnt(0) 4683; GFX900-NEXT: s_setpc_b64 s[30:31] 4684; 4685; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3: 4686; GFX90A: ; %bb.0: 4687; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4688; GFX90A-NEXT: ;;#ASMSTART 4689; GFX90A-NEXT: ; def v[0:1] 4690; GFX90A-NEXT: ;;#ASMEND 4691; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 4692; GFX90A-NEXT: v_mov_b32_e32 v4, 0 4693; GFX90A-NEXT: ;;#ASMSTART 4694; GFX90A-NEXT: ; def v[2:3] 4695; GFX90A-NEXT: ;;#ASMEND 4696; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 4697; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 4698; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 4699; GFX90A-NEXT: s_waitcnt vmcnt(0) 4700; GFX90A-NEXT: s_setpc_b64 s[30:31] 4701; 4702; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3: 4703; GFX940: ; %bb.0: 4704; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4705; GFX940-NEXT: ;;#ASMSTART 4706; GFX940-NEXT: ; def v[0:1] 4707; GFX940-NEXT: ;;#ASMEND 4708; GFX940-NEXT: s_mov_b32 s2, 0x5040100 4709; GFX940-NEXT: v_mov_b32_e32 v4, 0 4710; GFX940-NEXT: ;;#ASMSTART 4711; GFX940-NEXT: ; def v[2:3] 4712; GFX940-NEXT: ;;#ASMEND 4713; GFX940-NEXT: s_nop 0 4714; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 4715; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 4716; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 4717; GFX940-NEXT: s_waitcnt vmcnt(0) 4718; GFX940-NEXT: s_setpc_b64 s[30:31] 4719 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 4720 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 4721 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4722 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 4723 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 3, i32 3> 4724 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 4725 ret void 4726} 4727 4728define void @v_shuffle_v4bf16_v3bf16__5_1_3_3(ptr addrspace(1) inreg %ptr) { 4729; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3: 4730; GFX900: ; %bb.0: 4731; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4732; GFX900-NEXT: ;;#ASMSTART 4733; GFX900-NEXT: ; def v[0:1] 4734; GFX900-NEXT: ;;#ASMEND 4735; GFX900-NEXT: s_mov_b32 s4, 0xffff 4736; GFX900-NEXT: ;;#ASMSTART 4737; GFX900-NEXT: ; def v[1:2] 4738; GFX900-NEXT: ;;#ASMEND 4739; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 4740; GFX900-NEXT: s_mov_b32 s4, 0x5040100 4741; GFX900-NEXT: v_mov_b32_e32 v3, 0 4742; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 4743; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 4744; GFX900-NEXT: s_waitcnt vmcnt(0) 4745; GFX900-NEXT: s_setpc_b64 s[30:31] 4746; 4747; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3: 4748; GFX90A: ; %bb.0: 4749; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4750; GFX90A-NEXT: 
;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0xffff
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 3, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,2,3,3> selects vec1[2], vec0[2], vec1[0], vec1[0]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__5_2_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2
; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 3, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,4,3,3> selects vec1[2], vec1[1], vec1[0], vec1[0]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__5_4_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0xffff
; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b32 s2, 0xffff
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 3, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,5,3,3> selects vec1[2], vec1[2], vec1[0], vec1[0]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__5_5_3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4
; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4
; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2
; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,5,poison,3> selects vec1[2], vec1[2], poison, vec1[0]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__5_5_u_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,5,0,3> selects vec1[2], vec1[2], vec0[0], vec1[0]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__5_5_0_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4
; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4
; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2
; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,5,1,3> selects vec1[2], vec1[2], vec0[1], vec1[0]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__5_5_1_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16
; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16
; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,5,2,3> selects vec1[2], vec1[2], vec0[2], vec1[0]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__5_5_2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4
; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2
; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,5,4,3> selects vec1[2], vec1[2], vec1[1], vec1[0]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__5_5_4_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x5040100
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16
; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16
; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b32 s2, 0x5040100
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16
; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 3>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <poison,4,4,4> selects poison, vec1[1], vec1[1], vec1[1]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__u_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <0,4,4,4> selects vec0[0], vec1[1], vec1[1], vec1[1]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__0_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0xffff
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0xffff
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <1,4,4,4> selects vec0[1], vec1[1], vec1[1], vec1[1]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__1_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[1:2]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2
; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <2,4,4,4> selects vec0[2], vec1[1], vec1[1], vec1[1]
; (mask values 0-2 index %extract3 from %vec0; 3-5 index %extract31 from %vec1).
define void @v_shuffle_v4bf16_v3bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0xffff
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[2:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v4, 0
; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0xffff
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <3,4,4,4> selects vec1[0], vec1[1], vec1[1], vec1[1]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__3_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <4,4,4,4> broadcasts vec1[1] to all four result lanes; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__4_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_mov_b32_e32 v1, v0
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,4,4,4> selects vec1[2], vec1[1], vec1[1], vec1[1]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__5_4_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0xffff
; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b32 s2, 0xffff
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

; Mask <5,poison,4,4> selects vec1[2], poison, vec1[1], vec1[1]; no mask value
; refers to %extract3 (%vec0).
define void @v_shuffle_v4bf16_v3bf16__5_u_4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_mov_b32 s4, 0x7060302
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4
; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b32 s4, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b32 s2, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 4, i32 4>
  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
  ret void
}

define void @v_shuffle_v4bf16_v3bf16__5_0_4_4(ptr addrspace(1)
inreg %ptr) { 5685; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4: 5686; GFX900: ; %bb.0: 5687; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5688; GFX900-NEXT: ;;#ASMSTART 5689; GFX900-NEXT: ; def v[0:1] 5690; GFX900-NEXT: ;;#ASMEND 5691; GFX900-NEXT: s_mov_b32 s4, 0x5040100 5692; GFX900-NEXT: ;;#ASMSTART 5693; GFX900-NEXT: ; def v[1:2] 5694; GFX900-NEXT: ;;#ASMEND 5695; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 5696; GFX900-NEXT: s_mov_b32 s4, 0x7060302 5697; GFX900-NEXT: v_mov_b32_e32 v3, 0 5698; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 5699; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 5700; GFX900-NEXT: s_waitcnt vmcnt(0) 5701; GFX900-NEXT: s_setpc_b64 s[30:31] 5702; 5703; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4: 5704; GFX90A: ; %bb.0: 5705; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5706; GFX90A-NEXT: ;;#ASMSTART 5707; GFX90A-NEXT: ; def v[0:1] 5708; GFX90A-NEXT: ;;#ASMEND 5709; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 5710; GFX90A-NEXT: ;;#ASMSTART 5711; GFX90A-NEXT: ; def v[2:3] 5712; GFX90A-NEXT: ;;#ASMEND 5713; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 5714; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 5715; GFX90A-NEXT: v_mov_b32_e32 v4, 0 5716; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 5717; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 5718; GFX90A-NEXT: s_waitcnt vmcnt(0) 5719; GFX90A-NEXT: s_setpc_b64 s[30:31] 5720; 5721; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4: 5722; GFX940: ; %bb.0: 5723; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5724; GFX940-NEXT: ;;#ASMSTART 5725; GFX940-NEXT: ; def v[0:1] 5726; GFX940-NEXT: ;;#ASMEND 5727; GFX940-NEXT: s_mov_b32 s2, 0x5040100 5728; GFX940-NEXT: ;;#ASMSTART 5729; GFX940-NEXT: ; def v[2:3] 5730; GFX940-NEXT: ;;#ASMEND 5731; GFX940-NEXT: v_mov_b32_e32 v4, 0 5732; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 5733; GFX940-NEXT: s_mov_b32 s2, 0x7060302 5734; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 5735; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 5736; GFX940-NEXT: 
s_waitcnt vmcnt(0) 5737; GFX940-NEXT: s_setpc_b64 s[30:31] 5738 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 5739 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 5740 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5741 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5742 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 4, i32 4> 5743 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 5744 ret void 5745} 5746 5747define void @v_shuffle_v4bf16_v3bf16__5_1_4_4(ptr addrspace(1) inreg %ptr) { 5748; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4: 5749; GFX900: ; %bb.0: 5750; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5751; GFX900-NEXT: ;;#ASMSTART 5752; GFX900-NEXT: ; def v[0:1] 5753; GFX900-NEXT: ;;#ASMEND 5754; GFX900-NEXT: s_mov_b32 s4, 0xffff 5755; GFX900-NEXT: ;;#ASMSTART 5756; GFX900-NEXT: ; def v[1:2] 5757; GFX900-NEXT: ;;#ASMEND 5758; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 5759; GFX900-NEXT: s_mov_b32 s4, 0x7060302 5760; GFX900-NEXT: v_mov_b32_e32 v3, 0 5761; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 5762; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 5763; GFX900-NEXT: s_waitcnt vmcnt(0) 5764; GFX900-NEXT: s_setpc_b64 s[30:31] 5765; 5766; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4: 5767; GFX90A: ; %bb.0: 5768; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5769; GFX90A-NEXT: ;;#ASMSTART 5770; GFX90A-NEXT: ; def v[0:1] 5771; GFX90A-NEXT: ;;#ASMEND 5772; GFX90A-NEXT: s_mov_b32 s4, 0xffff 5773; GFX90A-NEXT: ;;#ASMSTART 5774; GFX90A-NEXT: ; def v[2:3] 5775; GFX90A-NEXT: ;;#ASMEND 5776; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 5777; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 5778; GFX90A-NEXT: v_mov_b32_e32 v4, 0 5779; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 5780; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 5781; GFX90A-NEXT: s_waitcnt vmcnt(0) 5782; GFX90A-NEXT: s_setpc_b64 
s[30:31] 5783; 5784; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4: 5785; GFX940: ; %bb.0: 5786; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5787; GFX940-NEXT: ;;#ASMSTART 5788; GFX940-NEXT: ; def v[0:1] 5789; GFX940-NEXT: ;;#ASMEND 5790; GFX940-NEXT: s_mov_b32 s2, 0xffff 5791; GFX940-NEXT: ;;#ASMSTART 5792; GFX940-NEXT: ; def v[2:3] 5793; GFX940-NEXT: ;;#ASMEND 5794; GFX940-NEXT: v_mov_b32_e32 v4, 0 5795; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 5796; GFX940-NEXT: s_mov_b32 s2, 0x7060302 5797; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 5798; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 5799; GFX940-NEXT: s_waitcnt vmcnt(0) 5800; GFX940-NEXT: s_setpc_b64 s[30:31] 5801 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 5802 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 5803 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5804 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5805 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 4, i32 4> 5806 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 5807 ret void 5808} 5809 5810define void @v_shuffle_v4bf16_v3bf16__5_2_4_4(ptr addrspace(1) inreg %ptr) { 5811; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4: 5812; GFX900: ; %bb.0: 5813; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5814; GFX900-NEXT: ;;#ASMSTART 5815; GFX900-NEXT: ; def v[0:1] 5816; GFX900-NEXT: ;;#ASMEND 5817; GFX900-NEXT: s_mov_b32 s4, 0x5040100 5818; GFX900-NEXT: ;;#ASMSTART 5819; GFX900-NEXT: ; def v[2:3] 5820; GFX900-NEXT: ;;#ASMEND 5821; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 5822; GFX900-NEXT: s_mov_b32 s4, 0x7060302 5823; GFX900-NEXT: v_mov_b32_e32 v4, 0 5824; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 5825; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 5826; GFX900-NEXT: s_waitcnt vmcnt(0) 5827; GFX900-NEXT: s_setpc_b64 s[30:31] 5828; 5829; GFX90A-LABEL: 
v_shuffle_v4bf16_v3bf16__5_2_4_4: 5830; GFX90A: ; %bb.0: 5831; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5832; GFX90A-NEXT: ;;#ASMSTART 5833; GFX90A-NEXT: ; def v[0:1] 5834; GFX90A-NEXT: ;;#ASMEND 5835; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 5836; GFX90A-NEXT: ;;#ASMSTART 5837; GFX90A-NEXT: ; def v[2:3] 5838; GFX90A-NEXT: ;;#ASMEND 5839; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 5840; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 5841; GFX90A-NEXT: v_mov_b32_e32 v4, 0 5842; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 5843; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 5844; GFX90A-NEXT: s_waitcnt vmcnt(0) 5845; GFX90A-NEXT: s_setpc_b64 s[30:31] 5846; 5847; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4: 5848; GFX940: ; %bb.0: 5849; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5850; GFX940-NEXT: ;;#ASMSTART 5851; GFX940-NEXT: ; def v[0:1] 5852; GFX940-NEXT: ;;#ASMEND 5853; GFX940-NEXT: s_mov_b32 s2, 0x5040100 5854; GFX940-NEXT: ;;#ASMSTART 5855; GFX940-NEXT: ; def v[2:3] 5856; GFX940-NEXT: ;;#ASMEND 5857; GFX940-NEXT: v_mov_b32_e32 v4, 0 5858; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 5859; GFX940-NEXT: s_mov_b32 s2, 0x7060302 5860; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 5861; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 5862; GFX940-NEXT: s_waitcnt vmcnt(0) 5863; GFX940-NEXT: s_setpc_b64 s[30:31] 5864 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 5865 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 5866 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5867 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5868 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 4, i32 4> 5869 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 5870 ret void 5871} 5872 5873define void @v_shuffle_v4bf16_v3bf16__5_3_4_4(ptr addrspace(1) inreg %ptr) { 5874; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4: 5875; 
GFX900: ; %bb.0: 5876; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5877; GFX900-NEXT: ;;#ASMSTART 5878; GFX900-NEXT: ; def v[0:1] 5879; GFX900-NEXT: ;;#ASMEND 5880; GFX900-NEXT: s_mov_b32 s4, 0x5040100 5881; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 5882; GFX900-NEXT: s_mov_b32 s4, 0x7060302 5883; GFX900-NEXT: v_mov_b32_e32 v3, 0 5884; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 5885; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 5886; GFX900-NEXT: s_waitcnt vmcnt(0) 5887; GFX900-NEXT: s_setpc_b64 s[30:31] 5888; 5889; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4: 5890; GFX90A: ; %bb.0: 5891; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5892; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 5893; GFX90A-NEXT: ;;#ASMSTART 5894; GFX90A-NEXT: ; def v[0:1] 5895; GFX90A-NEXT: ;;#ASMEND 5896; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 5897; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 5898; GFX90A-NEXT: v_mov_b32_e32 v4, 0 5899; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 5900; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 5901; GFX90A-NEXT: s_waitcnt vmcnt(0) 5902; GFX90A-NEXT: s_setpc_b64 s[30:31] 5903; 5904; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4: 5905; GFX940: ; %bb.0: 5906; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5907; GFX940-NEXT: s_mov_b32 s2, 0x5040100 5908; GFX940-NEXT: ;;#ASMSTART 5909; GFX940-NEXT: ; def v[0:1] 5910; GFX940-NEXT: ;;#ASMEND 5911; GFX940-NEXT: v_mov_b32_e32 v4, 0 5912; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 5913; GFX940-NEXT: s_mov_b32 s2, 0x7060302 5914; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 5915; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 5916; GFX940-NEXT: s_waitcnt vmcnt(0) 5917; GFX940-NEXT: s_setpc_b64 s[30:31] 5918 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 5919 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 5920 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5921 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> 
<i32 0, i32 1, i32 2> 5922 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 4, i32 4> 5923 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 5924 ret void 5925} 5926 5927define void @v_shuffle_v4bf16_v3bf16__5_5_4_4(ptr addrspace(1) inreg %ptr) { 5928; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4: 5929; GFX900: ; %bb.0: 5930; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5931; GFX900-NEXT: s_mov_b32 s4, 0x7060302 5932; GFX900-NEXT: ;;#ASMSTART 5933; GFX900-NEXT: ; def v[0:1] 5934; GFX900-NEXT: ;;#ASMEND 5935; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 5936; GFX900-NEXT: s_mov_b32 s4, 0x5040100 5937; GFX900-NEXT: v_mov_b32_e32 v3, 0 5938; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 5939; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 5940; GFX900-NEXT: s_waitcnt vmcnt(0) 5941; GFX900-NEXT: s_setpc_b64 s[30:31] 5942; 5943; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4: 5944; GFX90A: ; %bb.0: 5945; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5946; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 5947; GFX90A-NEXT: ;;#ASMSTART 5948; GFX90A-NEXT: ; def v[0:1] 5949; GFX90A-NEXT: ;;#ASMEND 5950; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 5951; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 5952; GFX90A-NEXT: v_mov_b32_e32 v4, 0 5953; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 5954; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 5955; GFX90A-NEXT: s_waitcnt vmcnt(0) 5956; GFX90A-NEXT: s_setpc_b64 s[30:31] 5957; 5958; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4: 5959; GFX940: ; %bb.0: 5960; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5961; GFX940-NEXT: s_mov_b32 s2, 0x7060302 5962; GFX940-NEXT: ;;#ASMSTART 5963; GFX940-NEXT: ; def v[0:1] 5964; GFX940-NEXT: ;;#ASMEND 5965; GFX940-NEXT: v_mov_b32_e32 v4, 0 5966; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 5967; GFX940-NEXT: s_mov_b32 s2, 0x5040100 5968; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 5969; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 5970; 
GFX940-NEXT: s_waitcnt vmcnt(0) 5971; GFX940-NEXT: s_setpc_b64 s[30:31] 5972 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 5973 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 5974 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5975 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 5976 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 4> 5977 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 5978 ret void 5979} 5980 5981define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) { 5982; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: 5983; GFX900: ; %bb.0: 5984; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5985; GFX900-NEXT: ;;#ASMSTART 5986; GFX900-NEXT: ; def v[0:1] 5987; GFX900-NEXT: ;;#ASMEND 5988; GFX900-NEXT: s_mov_b32 s4, 0x5040100 5989; GFX900-NEXT: v_mov_b32_e32 v3, 0 5990; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 5991; GFX900-NEXT: v_mov_b32_e32 v2, v0 5992; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 5993; GFX900-NEXT: s_waitcnt vmcnt(0) 5994; GFX900-NEXT: s_setpc_b64 s[30:31] 5995; 5996; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: 5997; GFX90A: ; %bb.0: 5998; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5999; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6000; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6001; GFX90A-NEXT: ;;#ASMSTART 6002; GFX90A-NEXT: ; def v[0:1] 6003; GFX90A-NEXT: ;;#ASMEND 6004; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 6005; GFX90A-NEXT: v_mov_b32_e32 v3, v0 6006; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 6007; GFX90A-NEXT: s_waitcnt vmcnt(0) 6008; GFX90A-NEXT: s_setpc_b64 s[30:31] 6009; 6010; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: 6011; GFX940: ; %bb.0: 6012; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6013; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6014; GFX940-NEXT: v_mov_b32_e32 v4, 0 6015; GFX940-NEXT: 
;;#ASMSTART 6016; GFX940-NEXT: ; def v[0:1] 6017; GFX940-NEXT: ;;#ASMEND 6018; GFX940-NEXT: s_nop 0 6019; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 6020; GFX940-NEXT: v_mov_b32_e32 v3, v0 6021; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 6022; GFX940-NEXT: s_waitcnt vmcnt(0) 6023; GFX940-NEXT: s_setpc_b64 s[30:31] 6024 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6025 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6026 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6027 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6028 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 4> 6029 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6030 ret void 6031} 6032 6033define void @v_shuffle_v4bf16_v3bf16__5_5_0_4(ptr addrspace(1) inreg %ptr) { 6034; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4: 6035; GFX900: ; %bb.0: 6036; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6037; GFX900-NEXT: ;;#ASMSTART 6038; GFX900-NEXT: ; def v[0:1] 6039; GFX900-NEXT: ;;#ASMEND 6040; GFX900-NEXT: ;;#ASMSTART 6041; GFX900-NEXT: ; def v[1:2] 6042; GFX900-NEXT: ;;#ASMEND 6043; GFX900-NEXT: s_mov_b32 s4, 0xffff 6044; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 6045; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6046; GFX900-NEXT: v_mov_b32_e32 v3, 0 6047; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 6048; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 6049; GFX900-NEXT: s_waitcnt vmcnt(0) 6050; GFX900-NEXT: s_setpc_b64 s[30:31] 6051; 6052; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4: 6053; GFX90A: ; %bb.0: 6054; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6055; GFX90A-NEXT: ;;#ASMSTART 6056; GFX90A-NEXT: ; def v[0:1] 6057; GFX90A-NEXT: ;;#ASMEND 6058; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6059; GFX90A-NEXT: ;;#ASMSTART 6060; GFX90A-NEXT: ; def v[2:3] 6061; GFX90A-NEXT: ;;#ASMEND 6062; GFX90A-NEXT: v_bfi_b32 
v1, s4, v0, v2 6063; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6064; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6065; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 6066; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6067; GFX90A-NEXT: s_waitcnt vmcnt(0) 6068; GFX90A-NEXT: s_setpc_b64 s[30:31] 6069; 6070; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4: 6071; GFX940: ; %bb.0: 6072; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6073; GFX940-NEXT: ;;#ASMSTART 6074; GFX940-NEXT: ; def v[0:1] 6075; GFX940-NEXT: ;;#ASMEND 6076; GFX940-NEXT: s_mov_b32 s2, 0xffff 6077; GFX940-NEXT: ;;#ASMSTART 6078; GFX940-NEXT: ; def v[2:3] 6079; GFX940-NEXT: ;;#ASMEND 6080; GFX940-NEXT: v_mov_b32_e32 v4, 0 6081; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 6082; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6083; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 6084; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6085; GFX940-NEXT: s_waitcnt vmcnt(0) 6086; GFX940-NEXT: s_setpc_b64 s[30:31] 6087 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6088 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6089 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6090 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6091 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 4> 6092 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6093 ret void 6094} 6095 6096define void @v_shuffle_v4bf16_v3bf16__5_5_1_4(ptr addrspace(1) inreg %ptr) { 6097; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4: 6098; GFX900: ; %bb.0: 6099; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6100; GFX900-NEXT: ;;#ASMSTART 6101; GFX900-NEXT: ; def v[0:1] 6102; GFX900-NEXT: ;;#ASMEND 6103; GFX900-NEXT: ;;#ASMSTART 6104; GFX900-NEXT: ; def v[1:2] 6105; GFX900-NEXT: ;;#ASMEND 6106; GFX900-NEXT: s_mov_b32 s4, 0x7060302 6107; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 6108; GFX900-NEXT: s_mov_b32 
s4, 0x5040100 6109; GFX900-NEXT: v_mov_b32_e32 v3, 0 6110; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 6111; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 6112; GFX900-NEXT: s_waitcnt vmcnt(0) 6113; GFX900-NEXT: s_setpc_b64 s[30:31] 6114; 6115; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4: 6116; GFX90A: ; %bb.0: 6117; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6118; GFX90A-NEXT: ;;#ASMSTART 6119; GFX90A-NEXT: ; def v[0:1] 6120; GFX90A-NEXT: ;;#ASMEND 6121; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 6122; GFX90A-NEXT: ;;#ASMSTART 6123; GFX90A-NEXT: ; def v[2:3] 6124; GFX90A-NEXT: ;;#ASMEND 6125; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 6126; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6127; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6128; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 6129; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6130; GFX90A-NEXT: s_waitcnt vmcnt(0) 6131; GFX90A-NEXT: s_setpc_b64 s[30:31] 6132; 6133; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4: 6134; GFX940: ; %bb.0: 6135; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6136; GFX940-NEXT: ;;#ASMSTART 6137; GFX940-NEXT: ; def v[0:1] 6138; GFX940-NEXT: ;;#ASMEND 6139; GFX940-NEXT: s_mov_b32 s2, 0x7060302 6140; GFX940-NEXT: ;;#ASMSTART 6141; GFX940-NEXT: ; def v[2:3] 6142; GFX940-NEXT: ;;#ASMEND 6143; GFX940-NEXT: v_mov_b32_e32 v4, 0 6144; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 6145; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6146; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 6147; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6148; GFX940-NEXT: s_waitcnt vmcnt(0) 6149; GFX940-NEXT: s_setpc_b64 s[30:31] 6150 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6151 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6152 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6153 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6154 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 
x i32> <i32 5, i32 5, i32 1, i32 4> 6155 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6156 ret void 6157} 6158 6159define void @v_shuffle_v4bf16_v3bf16__5_5_2_4(ptr addrspace(1) inreg %ptr) { 6160; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4: 6161; GFX900: ; %bb.0: 6162; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6163; GFX900-NEXT: ;;#ASMSTART 6164; GFX900-NEXT: ; def v[0:1] 6165; GFX900-NEXT: ;;#ASMEND 6166; GFX900-NEXT: s_mov_b32 s4, 0xffff 6167; GFX900-NEXT: ;;#ASMSTART 6168; GFX900-NEXT: ; def v[2:3] 6169; GFX900-NEXT: ;;#ASMEND 6170; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 6171; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6172; GFX900-NEXT: v_mov_b32_e32 v4, 0 6173; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 6174; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6175; GFX900-NEXT: s_waitcnt vmcnt(0) 6176; GFX900-NEXT: s_setpc_b64 s[30:31] 6177; 6178; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4: 6179; GFX90A: ; %bb.0: 6180; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6181; GFX90A-NEXT: ;;#ASMSTART 6182; GFX90A-NEXT: ; def v[0:1] 6183; GFX90A-NEXT: ;;#ASMEND 6184; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6185; GFX90A-NEXT: ;;#ASMSTART 6186; GFX90A-NEXT: ; def v[2:3] 6187; GFX90A-NEXT: ;;#ASMEND 6188; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 6189; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6190; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6191; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 6192; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6193; GFX90A-NEXT: s_waitcnt vmcnt(0) 6194; GFX90A-NEXT: s_setpc_b64 s[30:31] 6195; 6196; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4: 6197; GFX940: ; %bb.0: 6198; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6199; GFX940-NEXT: ;;#ASMSTART 6200; GFX940-NEXT: ; def v[0:1] 6201; GFX940-NEXT: ;;#ASMEND 6202; GFX940-NEXT: s_mov_b32 s2, 0xffff 6203; GFX940-NEXT: ;;#ASMSTART 6204; GFX940-NEXT: ; def v[2:3] 6205; GFX940-NEXT: ;;#ASMEND 6206; GFX940-NEXT: v_mov_b32_e32 v4, 0 6207; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 
6208; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6209; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 6210; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6211; GFX940-NEXT: s_waitcnt vmcnt(0) 6212; GFX940-NEXT: s_setpc_b64 s[30:31] 6213 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6214 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6215 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6216 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6217 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 4> 6218 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6219 ret void 6220} 6221 6222define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) { 6223; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: 6224; GFX900: ; %bb.0: 6225; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6226; GFX900-NEXT: ;;#ASMSTART 6227; GFX900-NEXT: ; def v[0:1] 6228; GFX900-NEXT: ;;#ASMEND 6229; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6230; GFX900-NEXT: v_mov_b32_e32 v3, 0 6231; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 6232; GFX900-NEXT: v_mov_b32_e32 v2, v0 6233; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 6234; GFX900-NEXT: s_waitcnt vmcnt(0) 6235; GFX900-NEXT: s_setpc_b64 s[30:31] 6236; 6237; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: 6238; GFX90A: ; %bb.0: 6239; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6240; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6241; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6242; GFX90A-NEXT: ;;#ASMSTART 6243; GFX90A-NEXT: ; def v[0:1] 6244; GFX90A-NEXT: ;;#ASMEND 6245; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 6246; GFX90A-NEXT: v_mov_b32_e32 v3, v0 6247; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 6248; GFX90A-NEXT: s_waitcnt vmcnt(0) 6249; GFX90A-NEXT: s_setpc_b64 s[30:31] 6250; 6251; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: 6252; GFX940: ; %bb.0: 6253; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6254; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6255; GFX940-NEXT: v_mov_b32_e32 v4, 0 6256; GFX940-NEXT: ;;#ASMSTART 6257; GFX940-NEXT: ; def v[0:1] 6258; GFX940-NEXT: ;;#ASMEND 6259; GFX940-NEXT: s_nop 0 6260; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 6261; GFX940-NEXT: v_mov_b32_e32 v3, v0 6262; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 6263; GFX940-NEXT: s_waitcnt vmcnt(0) 6264; GFX940-NEXT: s_setpc_b64 s[30:31] 6265 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6266 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6267 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6268 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6269 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 4> 6270 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6271 ret void 6272} 6273 6274define void @v_shuffle_v4bf16_v3bf16__u_5_5_5(ptr addrspace(1) inreg %ptr) { 6275; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5: 6276; GFX900: ; %bb.0: 6277; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6278; GFX900-NEXT: ;;#ASMSTART 6279; GFX900-NEXT: ; def v[0:1] 6280; GFX900-NEXT: ;;#ASMEND 6281; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6282; GFX900-NEXT: v_mov_b32_e32 v3, 0 6283; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 6284; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6285; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 6286; GFX900-NEXT: s_waitcnt vmcnt(0) 6287; GFX900-NEXT: s_setpc_b64 s[30:31] 6288; 6289; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5: 6290; GFX90A: ; %bb.0: 6291; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6292; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6293; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6294; GFX90A-NEXT: ;;#ASMSTART 6295; GFX90A-NEXT: ; def v[0:1] 6296; GFX90A-NEXT: ;;#ASMEND 6297; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 6298; 
GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6299; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 6300; GFX90A-NEXT: s_waitcnt vmcnt(0) 6301; GFX90A-NEXT: s_setpc_b64 s[30:31] 6302; 6303; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5: 6304; GFX940: ; %bb.0: 6305; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6306; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6307; GFX940-NEXT: v_mov_b32_e32 v4, 0 6308; GFX940-NEXT: ;;#ASMSTART 6309; GFX940-NEXT: ; def v[0:1] 6310; GFX940-NEXT: ;;#ASMEND 6311; GFX940-NEXT: s_nop 0 6312; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 6313; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6314; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 6315; GFX940-NEXT: s_waitcnt vmcnt(0) 6316; GFX940-NEXT: s_setpc_b64 s[30:31] 6317 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6318 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6319 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6320 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6321 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> 6322 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6323 ret void 6324} 6325 6326define void @v_shuffle_v4bf16_v3bf16__0_5_5_5(ptr addrspace(1) inreg %ptr) { 6327; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5: 6328; GFX900: ; %bb.0: 6329; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6330; GFX900-NEXT: ;;#ASMSTART 6331; GFX900-NEXT: ; def v[0:1] 6332; GFX900-NEXT: ;;#ASMEND 6333; GFX900-NEXT: ;;#ASMSTART 6334; GFX900-NEXT: ; def v[1:2] 6335; GFX900-NEXT: ;;#ASMEND 6336; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6337; GFX900-NEXT: v_mov_b32_e32 v3, 0 6338; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 6339; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 6340; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 6341; GFX900-NEXT: s_waitcnt vmcnt(0) 6342; GFX900-NEXT: s_setpc_b64 s[30:31] 
6343; 6344; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5: 6345; GFX90A: ; %bb.0: 6346; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6347; GFX90A-NEXT: ;;#ASMSTART 6348; GFX90A-NEXT: ; def v[0:1] 6349; GFX90A-NEXT: ;;#ASMEND 6350; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6351; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6352; GFX90A-NEXT: ;;#ASMSTART 6353; GFX90A-NEXT: ; def v[2:3] 6354; GFX90A-NEXT: ;;#ASMEND 6355; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 6356; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 6357; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6358; GFX90A-NEXT: s_waitcnt vmcnt(0) 6359; GFX90A-NEXT: s_setpc_b64 s[30:31] 6360; 6361; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5: 6362; GFX940: ; %bb.0: 6363; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6364; GFX940-NEXT: ;;#ASMSTART 6365; GFX940-NEXT: ; def v[0:1] 6366; GFX940-NEXT: ;;#ASMEND 6367; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6368; GFX940-NEXT: v_mov_b32_e32 v4, 0 6369; GFX940-NEXT: ;;#ASMSTART 6370; GFX940-NEXT: ; def v[2:3] 6371; GFX940-NEXT: ;;#ASMEND 6372; GFX940-NEXT: s_nop 0 6373; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 6374; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 6375; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6376; GFX940-NEXT: s_waitcnt vmcnt(0) 6377; GFX940-NEXT: s_setpc_b64 s[30:31] 6378 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6379 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6380 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6381 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6382 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 5, i32 5, i32 5> 6383 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6384 ret void 6385} 6386 6387define void @v_shuffle_v4bf16_v3bf16__1_5_5_5(ptr addrspace(1) inreg %ptr) { 6388; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5: 6389; GFX900: ; %bb.0: 6390; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6391; GFX900-NEXT: ;;#ASMSTART 6392; GFX900-NEXT: ; def v[0:1] 6393; GFX900-NEXT: ;;#ASMEND 6394; GFX900-NEXT: ;;#ASMSTART 6395; GFX900-NEXT: ; def v[1:2] 6396; GFX900-NEXT: ;;#ASMEND 6397; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6398; GFX900-NEXT: v_mov_b32_e32 v3, 0 6399; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 6400; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 6401; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 6402; GFX900-NEXT: s_waitcnt vmcnt(0) 6403; GFX900-NEXT: s_setpc_b64 s[30:31] 6404; 6405; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5: 6406; GFX90A: ; %bb.0: 6407; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6408; GFX90A-NEXT: ;;#ASMSTART 6409; GFX90A-NEXT: ; def v[0:1] 6410; GFX90A-NEXT: ;;#ASMEND 6411; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6412; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6413; GFX90A-NEXT: ;;#ASMSTART 6414; GFX90A-NEXT: ; def v[2:3] 6415; GFX90A-NEXT: ;;#ASMEND 6416; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 6417; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 6418; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6419; GFX90A-NEXT: s_waitcnt vmcnt(0) 6420; GFX90A-NEXT: s_setpc_b64 s[30:31] 6421; 6422; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5: 6423; GFX940: ; %bb.0: 6424; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6425; GFX940-NEXT: ;;#ASMSTART 6426; GFX940-NEXT: ; def v[0:1] 6427; GFX940-NEXT: ;;#ASMEND 6428; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6429; GFX940-NEXT: v_mov_b32_e32 v4, 0 6430; GFX940-NEXT: ;;#ASMSTART 6431; GFX940-NEXT: ; def v[2:3] 6432; GFX940-NEXT: ;;#ASMEND 6433; GFX940-NEXT: s_nop 0 6434; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 6435; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 6436; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6437; GFX940-NEXT: s_waitcnt vmcnt(0) 6438; GFX940-NEXT: s_setpc_b64 s[30:31] 6439 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6440 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6441 %extract3 = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6442 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6443 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 5, i32 5, i32 5> 6444 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6445 ret void 6446} 6447 6448define void @v_shuffle_v4bf16_v3bf16__2_5_5_5(ptr addrspace(1) inreg %ptr) { 6449; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5: 6450; GFX900: ; %bb.0: 6451; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6452; GFX900-NEXT: ;;#ASMSTART 6453; GFX900-NEXT: ; def v[0:1] 6454; GFX900-NEXT: ;;#ASMEND 6455; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6456; GFX900-NEXT: v_mov_b32_e32 v4, 0 6457; GFX900-NEXT: ;;#ASMSTART 6458; GFX900-NEXT: ; def v[2:3] 6459; GFX900-NEXT: ;;#ASMEND 6460; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 6461; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 6462; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6463; GFX900-NEXT: s_waitcnt vmcnt(0) 6464; GFX900-NEXT: s_setpc_b64 s[30:31] 6465; 6466; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5: 6467; GFX90A: ; %bb.0: 6468; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6469; GFX90A-NEXT: ;;#ASMSTART 6470; GFX90A-NEXT: ; def v[0:1] 6471; GFX90A-NEXT: ;;#ASMEND 6472; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6473; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6474; GFX90A-NEXT: ;;#ASMSTART 6475; GFX90A-NEXT: ; def v[2:3] 6476; GFX90A-NEXT: ;;#ASMEND 6477; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 6478; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 6479; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6480; GFX90A-NEXT: s_waitcnt vmcnt(0) 6481; GFX90A-NEXT: s_setpc_b64 s[30:31] 6482; 6483; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5: 6484; GFX940: ; %bb.0: 6485; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6486; GFX940-NEXT: ;;#ASMSTART 6487; GFX940-NEXT: ; def v[0:1] 6488; GFX940-NEXT: ;;#ASMEND 6489; GFX940-NEXT: 
s_mov_b32 s2, 0x5040100 6490; GFX940-NEXT: v_mov_b32_e32 v4, 0 6491; GFX940-NEXT: ;;#ASMSTART 6492; GFX940-NEXT: ; def v[2:3] 6493; GFX940-NEXT: ;;#ASMEND 6494; GFX940-NEXT: s_nop 0 6495; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 6496; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 6497; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6498; GFX940-NEXT: s_waitcnt vmcnt(0) 6499; GFX940-NEXT: s_setpc_b64 s[30:31] 6500 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6501 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6502 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6503 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6504 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 5, i32 5, i32 5> 6505 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6506 ret void 6507} 6508 6509define void @v_shuffle_v4bf16_v3bf16__3_5_5_5(ptr addrspace(1) inreg %ptr) { 6510; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5: 6511; GFX900: ; %bb.0: 6512; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6513; GFX900-NEXT: ;;#ASMSTART 6514; GFX900-NEXT: ; def v[0:1] 6515; GFX900-NEXT: ;;#ASMEND 6516; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6517; GFX900-NEXT: v_mov_b32_e32 v2, 0 6518; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 6519; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 6520; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6521; GFX900-NEXT: s_waitcnt vmcnt(0) 6522; GFX900-NEXT: s_setpc_b64 s[30:31] 6523; 6524; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5: 6525; GFX90A: ; %bb.0: 6526; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6527; GFX90A-NEXT: ;;#ASMSTART 6528; GFX90A-NEXT: ; def v[0:1] 6529; GFX90A-NEXT: ;;#ASMEND 6530; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6531; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6532; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 6533; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 6534; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] 6535; GFX90A-NEXT: s_waitcnt vmcnt(0) 6536; GFX90A-NEXT: s_setpc_b64 s[30:31] 6537; 6538; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5: 6539; GFX940: ; %bb.0: 6540; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6541; GFX940-NEXT: ;;#ASMSTART 6542; GFX940-NEXT: ; def v[0:1] 6543; GFX940-NEXT: ;;#ASMEND 6544; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6545; GFX940-NEXT: v_mov_b32_e32 v2, 0 6546; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 6547; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 6548; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 6549; GFX940-NEXT: s_waitcnt vmcnt(0) 6550; GFX940-NEXT: s_setpc_b64 s[30:31] 6551 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6552 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6553 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6554 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6555 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 5, i32 5, i32 5> 6556 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6557 ret void 6558} 6559 6560define void @v_shuffle_v4bf16_v3bf16__4_5_5_5(ptr addrspace(1) inreg %ptr) { 6561; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5: 6562; GFX900: ; %bb.0: 6563; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6564; GFX900-NEXT: ;;#ASMSTART 6565; GFX900-NEXT: ; def v[0:1] 6566; GFX900-NEXT: ;;#ASMEND 6567; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6568; GFX900-NEXT: v_mov_b32_e32 v3, 0 6569; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 6570; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 6571; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 6572; GFX900-NEXT: s_waitcnt vmcnt(0) 6573; GFX900-NEXT: s_setpc_b64 s[30:31] 6574; 6575; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5: 6576; GFX90A: ; %bb.0: 6577; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6578; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 
6579; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6580; GFX90A-NEXT: ;;#ASMSTART 6581; GFX90A-NEXT: ; def v[0:1] 6582; GFX90A-NEXT: ;;#ASMEND 6583; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 6584; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 6585; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 6586; GFX90A-NEXT: s_waitcnt vmcnt(0) 6587; GFX90A-NEXT: s_setpc_b64 s[30:31] 6588; 6589; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5: 6590; GFX940: ; %bb.0: 6591; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6592; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6593; GFX940-NEXT: v_mov_b32_e32 v4, 0 6594; GFX940-NEXT: ;;#ASMSTART 6595; GFX940-NEXT: ; def v[0:1] 6596; GFX940-NEXT: ;;#ASMEND 6597; GFX940-NEXT: s_nop 0 6598; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 6599; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 6600; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 6601; GFX940-NEXT: s_waitcnt vmcnt(0) 6602; GFX940-NEXT: s_setpc_b64 s[30:31] 6603 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6604 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6605 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6606 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6607 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 5, i32 5, i32 5> 6608 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6609 ret void 6610} 6611 6612define void @v_shuffle_v4bf16_v3bf16__5_u_5_5(ptr addrspace(1) inreg %ptr) { 6613; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5: 6614; GFX900: ; %bb.0: 6615; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6616; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6617; GFX900-NEXT: v_mov_b32_e32 v3, 0 6618; GFX900-NEXT: ;;#ASMSTART 6619; GFX900-NEXT: ; def v[0:1] 6620; GFX900-NEXT: ;;#ASMEND 6621; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 6622; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 6623; GFX900-NEXT: s_waitcnt vmcnt(0) 
6624; GFX900-NEXT: s_setpc_b64 s[30:31] 6625; 6626; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5: 6627; GFX90A: ; %bb.0: 6628; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6629; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6630; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6631; GFX90A-NEXT: ;;#ASMSTART 6632; GFX90A-NEXT: ; def v[0:1] 6633; GFX90A-NEXT: ;;#ASMEND 6634; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 6635; GFX90A-NEXT: v_mov_b32_e32 v2, v1 6636; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 6637; GFX90A-NEXT: s_waitcnt vmcnt(0) 6638; GFX90A-NEXT: s_setpc_b64 s[30:31] 6639; 6640; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5: 6641; GFX940: ; %bb.0: 6642; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6643; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6644; GFX940-NEXT: v_mov_b32_e32 v4, 0 6645; GFX940-NEXT: ;;#ASMSTART 6646; GFX940-NEXT: ; def v[0:1] 6647; GFX940-NEXT: ;;#ASMEND 6648; GFX940-NEXT: s_nop 0 6649; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 6650; GFX940-NEXT: v_mov_b32_e32 v2, v1 6651; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 6652; GFX940-NEXT: s_waitcnt vmcnt(0) 6653; GFX940-NEXT: s_setpc_b64 s[30:31] 6654 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6655 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6656 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6657 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6658 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 5, i32 5> 6659 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6660 ret void 6661} 6662 6663define void @v_shuffle_v4bf16_v3bf16__5_0_5_5(ptr addrspace(1) inreg %ptr) { 6664; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5: 6665; GFX900: ; %bb.0: 6666; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6667; GFX900-NEXT: ;;#ASMSTART 6668; GFX900-NEXT: ; def v[0:1] 6669; GFX900-NEXT: ;;#ASMEND 6670; 
GFX900-NEXT: ;;#ASMSTART 6671; GFX900-NEXT: ; def v[1:2] 6672; GFX900-NEXT: ;;#ASMEND 6673; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6674; GFX900-NEXT: v_mov_b32_e32 v3, 0 6675; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 6676; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 6677; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 6678; GFX900-NEXT: s_waitcnt vmcnt(0) 6679; GFX900-NEXT: s_setpc_b64 s[30:31] 6680; 6681; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5: 6682; GFX90A: ; %bb.0: 6683; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6684; GFX90A-NEXT: ;;#ASMSTART 6685; GFX90A-NEXT: ; def v[0:1] 6686; GFX90A-NEXT: ;;#ASMEND 6687; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6688; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6689; GFX90A-NEXT: ;;#ASMSTART 6690; GFX90A-NEXT: ; def v[2:3] 6691; GFX90A-NEXT: ;;#ASMEND 6692; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 6693; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 6694; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6695; GFX90A-NEXT: s_waitcnt vmcnt(0) 6696; GFX90A-NEXT: s_setpc_b64 s[30:31] 6697; 6698; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5: 6699; GFX940: ; %bb.0: 6700; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6701; GFX940-NEXT: ;;#ASMSTART 6702; GFX940-NEXT: ; def v[0:1] 6703; GFX940-NEXT: ;;#ASMEND 6704; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6705; GFX940-NEXT: v_mov_b32_e32 v4, 0 6706; GFX940-NEXT: ;;#ASMSTART 6707; GFX940-NEXT: ; def v[2:3] 6708; GFX940-NEXT: ;;#ASMEND 6709; GFX940-NEXT: s_nop 0 6710; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 6711; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 6712; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6713; GFX940-NEXT: s_waitcnt vmcnt(0) 6714; GFX940-NEXT: s_setpc_b64 s[30:31] 6715 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6716 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6717 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6718 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> 
<i32 0, i32 1, i32 2> 6719 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 5, i32 5> 6720 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6721 ret void 6722} 6723 6724define void @v_shuffle_v4bf16_v3bf16__5_1_5_5(ptr addrspace(1) inreg %ptr) { 6725; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5: 6726; GFX900: ; %bb.0: 6727; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6728; GFX900-NEXT: ;;#ASMSTART 6729; GFX900-NEXT: ; def v[0:1] 6730; GFX900-NEXT: ;;#ASMEND 6731; GFX900-NEXT: s_mov_b32 s4, 0xffff 6732; GFX900-NEXT: ;;#ASMSTART 6733; GFX900-NEXT: ; def v[1:2] 6734; GFX900-NEXT: ;;#ASMEND 6735; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 6736; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6737; GFX900-NEXT: v_mov_b32_e32 v3, 0 6738; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 6739; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 6740; GFX900-NEXT: s_waitcnt vmcnt(0) 6741; GFX900-NEXT: s_setpc_b64 s[30:31] 6742; 6743; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5: 6744; GFX90A: ; %bb.0: 6745; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6746; GFX90A-NEXT: ;;#ASMSTART 6747; GFX90A-NEXT: ; def v[0:1] 6748; GFX90A-NEXT: ;;#ASMEND 6749; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6750; GFX90A-NEXT: ;;#ASMSTART 6751; GFX90A-NEXT: ; def v[2:3] 6752; GFX90A-NEXT: ;;#ASMEND 6753; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 6754; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6755; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6756; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 6757; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6758; GFX90A-NEXT: s_waitcnt vmcnt(0) 6759; GFX90A-NEXT: s_setpc_b64 s[30:31] 6760; 6761; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5: 6762; GFX940: ; %bb.0: 6763; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6764; GFX940-NEXT: ;;#ASMSTART 6765; GFX940-NEXT: ; def v[0:1] 6766; GFX940-NEXT: ;;#ASMEND 6767; GFX940-NEXT: s_mov_b32 s2, 0xffff 6768; GFX940-NEXT: ;;#ASMSTART 6769; GFX940-NEXT: ; def v[2:3] 6770; 
GFX940-NEXT: ;;#ASMEND 6771; GFX940-NEXT: v_mov_b32_e32 v4, 0 6772; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 6773; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6774; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 6775; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6776; GFX940-NEXT: s_waitcnt vmcnt(0) 6777; GFX940-NEXT: s_setpc_b64 s[30:31] 6778 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6779 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6780 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6781 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6782 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 5, i32 5> 6783 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6784 ret void 6785} 6786 6787define void @v_shuffle_v4bf16_v3bf16__5_2_5_5(ptr addrspace(1) inreg %ptr) { 6788; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5: 6789; GFX900: ; %bb.0: 6790; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6791; GFX900-NEXT: ;;#ASMSTART 6792; GFX900-NEXT: ; def v[0:1] 6793; GFX900-NEXT: ;;#ASMEND 6794; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6795; GFX900-NEXT: v_mov_b32_e32 v4, 0 6796; GFX900-NEXT: ;;#ASMSTART 6797; GFX900-NEXT: ; def v[2:3] 6798; GFX900-NEXT: ;;#ASMEND 6799; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 6800; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 6801; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6802; GFX900-NEXT: s_waitcnt vmcnt(0) 6803; GFX900-NEXT: s_setpc_b64 s[30:31] 6804; 6805; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5: 6806; GFX90A: ; %bb.0: 6807; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6808; GFX90A-NEXT: ;;#ASMSTART 6809; GFX90A-NEXT: ; def v[0:1] 6810; GFX90A-NEXT: ;;#ASMEND 6811; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6812; GFX90A-NEXT: v_mov_b32_e32 v4, 0 6813; GFX90A-NEXT: ;;#ASMSTART 6814; GFX90A-NEXT: ; def v[2:3] 6815; GFX90A-NEXT: ;;#ASMEND 6816; GFX90A-NEXT: 
v_perm_b32 v0, v1, v3, s4 6817; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 6818; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 6819; GFX90A-NEXT: s_waitcnt vmcnt(0) 6820; GFX90A-NEXT: s_setpc_b64 s[30:31] 6821; 6822; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5: 6823; GFX940: ; %bb.0: 6824; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6825; GFX940-NEXT: ;;#ASMSTART 6826; GFX940-NEXT: ; def v[0:1] 6827; GFX940-NEXT: ;;#ASMEND 6828; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6829; GFX940-NEXT: v_mov_b32_e32 v4, 0 6830; GFX940-NEXT: ;;#ASMSTART 6831; GFX940-NEXT: ; def v[2:3] 6832; GFX940-NEXT: ;;#ASMEND 6833; GFX940-NEXT: s_nop 0 6834; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 6835; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 6836; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 6837; GFX940-NEXT: s_waitcnt vmcnt(0) 6838; GFX940-NEXT: s_setpc_b64 s[30:31] 6839 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6840 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6841 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6842 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6843 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 5, i32 5> 6844 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6845 ret void 6846} 6847 6848define void @v_shuffle_v4bf16_v3bf16__5_3_5_5(ptr addrspace(1) inreg %ptr) { 6849; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5: 6850; GFX900: ; %bb.0: 6851; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6852; GFX900-NEXT: ;;#ASMSTART 6853; GFX900-NEXT: ; def v[0:1] 6854; GFX900-NEXT: ;;#ASMEND 6855; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6856; GFX900-NEXT: v_mov_b32_e32 v2, 0 6857; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 6858; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 6859; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6860; GFX900-NEXT: s_waitcnt vmcnt(0) 6861; GFX900-NEXT: 
s_setpc_b64 s[30:31] 6862; 6863; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5: 6864; GFX90A: ; %bb.0: 6865; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6866; GFX90A-NEXT: ;;#ASMSTART 6867; GFX90A-NEXT: ; def v[0:1] 6868; GFX90A-NEXT: ;;#ASMEND 6869; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6870; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6871; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 6872; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 6873; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6874; GFX90A-NEXT: s_waitcnt vmcnt(0) 6875; GFX90A-NEXT: s_setpc_b64 s[30:31] 6876; 6877; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5: 6878; GFX940: ; %bb.0: 6879; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6880; GFX940-NEXT: ;;#ASMSTART 6881; GFX940-NEXT: ; def v[0:1] 6882; GFX940-NEXT: ;;#ASMEND 6883; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6884; GFX940-NEXT: v_mov_b32_e32 v2, 0 6885; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 6886; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 6887; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 6888; GFX940-NEXT: s_waitcnt vmcnt(0) 6889; GFX940-NEXT: s_setpc_b64 s[30:31] 6890 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6891 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6892 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6893 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6894 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 5, i32 5> 6895 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6896 ret void 6897} 6898 6899define void @v_shuffle_v4bf16_v3bf16__5_4_5_5(ptr addrspace(1) inreg %ptr) { 6900; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5: 6901; GFX900: ; %bb.0: 6902; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6903; GFX900-NEXT: ;;#ASMSTART 6904; GFX900-NEXT: ; def v[0:1] 6905; GFX900-NEXT: ;;#ASMEND 6906; GFX900-NEXT: s_mov_b32 s4, 0xffff 6907; 
GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 6908; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6909; GFX900-NEXT: v_mov_b32_e32 v2, 0 6910; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 6911; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6912; GFX900-NEXT: s_waitcnt vmcnt(0) 6913; GFX900-NEXT: s_setpc_b64 s[30:31] 6914; 6915; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5: 6916; GFX90A: ; %bb.0: 6917; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6918; GFX90A-NEXT: ;;#ASMSTART 6919; GFX90A-NEXT: ; def v[0:1] 6920; GFX90A-NEXT: ;;#ASMEND 6921; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6922; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 6923; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6924; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6925; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 6926; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6927; GFX90A-NEXT: s_waitcnt vmcnt(0) 6928; GFX90A-NEXT: s_setpc_b64 s[30:31] 6929; 6930; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5: 6931; GFX940: ; %bb.0: 6932; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6933; GFX940-NEXT: ;;#ASMSTART 6934; GFX940-NEXT: ; def v[0:1] 6935; GFX940-NEXT: ;;#ASMEND 6936; GFX940-NEXT: s_mov_b32 s2, 0xffff 6937; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 6938; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6939; GFX940-NEXT: v_mov_b32_e32 v2, 0 6940; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 6941; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 6942; GFX940-NEXT: s_waitcnt vmcnt(0) 6943; GFX940-NEXT: s_setpc_b64 s[30:31] 6944 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6945 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6946 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6947 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6948 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 5, i32 5> 6949 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 6950 ret void 6951} 6952 
6953define void @v_shuffle_v4bf16_v3bf16__5_5_u_5(ptr addrspace(1) inreg %ptr) { 6954; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5: 6955; GFX900: ; %bb.0: 6956; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6957; GFX900-NEXT: ;;#ASMSTART 6958; GFX900-NEXT: ; def v[0:1] 6959; GFX900-NEXT: ;;#ASMEND 6960; GFX900-NEXT: s_mov_b32 s4, 0x5040100 6961; GFX900-NEXT: v_mov_b32_e32 v2, 0 6962; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 6963; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6964; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6965; GFX900-NEXT: s_waitcnt vmcnt(0) 6966; GFX900-NEXT: s_setpc_b64 s[30:31] 6967; 6968; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5: 6969; GFX90A: ; %bb.0: 6970; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6971; GFX90A-NEXT: ;;#ASMSTART 6972; GFX90A-NEXT: ; def v[0:1] 6973; GFX90A-NEXT: ;;#ASMEND 6974; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 6975; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6976; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 6977; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6978; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 6979; GFX90A-NEXT: s_waitcnt vmcnt(0) 6980; GFX90A-NEXT: s_setpc_b64 s[30:31] 6981; 6982; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5: 6983; GFX940: ; %bb.0: 6984; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6985; GFX940-NEXT: ;;#ASMSTART 6986; GFX940-NEXT: ; def v[0:1] 6987; GFX940-NEXT: ;;#ASMEND 6988; GFX940-NEXT: s_mov_b32 s2, 0x5040100 6989; GFX940-NEXT: v_mov_b32_e32 v2, 0 6990; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 6991; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 6992; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 6993; GFX940-NEXT: s_waitcnt vmcnt(0) 6994; GFX940-NEXT: s_setpc_b64 s[30:31] 6995 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 6996 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 6997 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6998 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x 
bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 6999 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 5> 7000 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 7001 ret void 7002} 7003 7004define void @v_shuffle_v4bf16_v3bf16__5_5_0_5(ptr addrspace(1) inreg %ptr) { 7005; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5: 7006; GFX900: ; %bb.0: 7007; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7008; GFX900-NEXT: ;;#ASMSTART 7009; GFX900-NEXT: ; def v[0:1] 7010; GFX900-NEXT: ;;#ASMEND 7011; GFX900-NEXT: ;;#ASMSTART 7012; GFX900-NEXT: ; def v[1:2] 7013; GFX900-NEXT: ;;#ASMEND 7014; GFX900-NEXT: s_mov_b32 s4, 0x5040100 7015; GFX900-NEXT: v_mov_b32_e32 v3, 0 7016; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 7017; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 7018; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 7019; GFX900-NEXT: s_waitcnt vmcnt(0) 7020; GFX900-NEXT: s_setpc_b64 s[30:31] 7021; 7022; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5: 7023; GFX90A: ; %bb.0: 7024; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7025; GFX90A-NEXT: ;;#ASMSTART 7026; GFX90A-NEXT: ; def v[0:1] 7027; GFX90A-NEXT: ;;#ASMEND 7028; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 7029; GFX90A-NEXT: v_mov_b32_e32 v4, 0 7030; GFX90A-NEXT: ;;#ASMSTART 7031; GFX90A-NEXT: ; def v[2:3] 7032; GFX90A-NEXT: ;;#ASMEND 7033; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 7034; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 7035; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 7036; GFX90A-NEXT: s_waitcnt vmcnt(0) 7037; GFX90A-NEXT: s_setpc_b64 s[30:31] 7038; 7039; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5: 7040; GFX940: ; %bb.0: 7041; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7042; GFX940-NEXT: ;;#ASMSTART 7043; GFX940-NEXT: ; def v[0:1] 7044; GFX940-NEXT: ;;#ASMEND 7045; GFX940-NEXT: s_mov_b32 s2, 0x5040100 7046; GFX940-NEXT: v_mov_b32_e32 v4, 0 7047; GFX940-NEXT: ;;#ASMSTART 7048; GFX940-NEXT: ; def v[2:3] 7049; 
GFX940-NEXT: ;;#ASMEND 7050; GFX940-NEXT: s_nop 0 7051; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 7052; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 7053; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 7054; GFX940-NEXT: s_waitcnt vmcnt(0) 7055; GFX940-NEXT: s_setpc_b64 s[30:31] 7056 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 7057 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 7058 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7059 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7060 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 5> 7061 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 7062 ret void 7063} 7064 7065define void @v_shuffle_v4bf16_v3bf16__5_5_1_5(ptr addrspace(1) inreg %ptr) { 7066; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5: 7067; GFX900: ; %bb.0: 7068; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7069; GFX900-NEXT: ;;#ASMSTART 7070; GFX900-NEXT: ; def v[0:1] 7071; GFX900-NEXT: ;;#ASMEND 7072; GFX900-NEXT: ;;#ASMSTART 7073; GFX900-NEXT: ; def v[1:2] 7074; GFX900-NEXT: ;;#ASMEND 7075; GFX900-NEXT: s_mov_b32 s4, 0x5040100 7076; GFX900-NEXT: v_mov_b32_e32 v3, 0 7077; GFX900-NEXT: v_alignbit_b32 v1, v2, v0, 16 7078; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 7079; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] 7080; GFX900-NEXT: s_waitcnt vmcnt(0) 7081; GFX900-NEXT: s_setpc_b64 s[30:31] 7082; 7083; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5: 7084; GFX90A: ; %bb.0: 7085; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7086; GFX90A-NEXT: ;;#ASMSTART 7087; GFX90A-NEXT: ; def v[0:1] 7088; GFX90A-NEXT: ;;#ASMEND 7089; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 7090; GFX90A-NEXT: v_mov_b32_e32 v4, 0 7091; GFX90A-NEXT: ;;#ASMSTART 7092; GFX90A-NEXT: ; def v[2:3] 7093; GFX90A-NEXT: ;;#ASMEND 7094; GFX90A-NEXT: v_alignbit_b32 v1, v3, v0, 16 7095; GFX90A-NEXT: 
v_perm_b32 v0, v3, v3, s4 7096; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 7097; GFX90A-NEXT: s_waitcnt vmcnt(0) 7098; GFX90A-NEXT: s_setpc_b64 s[30:31] 7099; 7100; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5: 7101; GFX940: ; %bb.0: 7102; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7103; GFX940-NEXT: ;;#ASMSTART 7104; GFX940-NEXT: ; def v[0:1] 7105; GFX940-NEXT: ;;#ASMEND 7106; GFX940-NEXT: s_mov_b32 s2, 0x5040100 7107; GFX940-NEXT: v_mov_b32_e32 v4, 0 7108; GFX940-NEXT: ;;#ASMSTART 7109; GFX940-NEXT: ; def v[2:3] 7110; GFX940-NEXT: ;;#ASMEND 7111; GFX940-NEXT: s_nop 0 7112; GFX940-NEXT: v_alignbit_b32 v1, v3, v0, 16 7113; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 7114; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 7115; GFX940-NEXT: s_waitcnt vmcnt(0) 7116; GFX940-NEXT: s_setpc_b64 s[30:31] 7117 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 7118 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 7119 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7120 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7121 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 5> 7122 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 7123 ret void 7124} 7125 7126define void @v_shuffle_v4bf16_v3bf16__5_5_2_5(ptr addrspace(1) inreg %ptr) { 7127; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5: 7128; GFX900: ; %bb.0: 7129; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7130; GFX900-NEXT: ;;#ASMSTART 7131; GFX900-NEXT: ; def v[0:1] 7132; GFX900-NEXT: ;;#ASMEND 7133; GFX900-NEXT: s_mov_b32 s4, 0x5040100 7134; GFX900-NEXT: v_mov_b32_e32 v4, 0 7135; GFX900-NEXT: ;;#ASMSTART 7136; GFX900-NEXT: ; def v[2:3] 7137; GFX900-NEXT: ;;#ASMEND 7138; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 7139; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 7140; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 7141; 
GFX900-NEXT: s_waitcnt vmcnt(0) 7142; GFX900-NEXT: s_setpc_b64 s[30:31] 7143; 7144; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5: 7145; GFX90A: ; %bb.0: 7146; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7147; GFX90A-NEXT: ;;#ASMSTART 7148; GFX90A-NEXT: ; def v[0:1] 7149; GFX90A-NEXT: ;;#ASMEND 7150; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 7151; GFX90A-NEXT: v_mov_b32_e32 v4, 0 7152; GFX90A-NEXT: ;;#ASMSTART 7153; GFX90A-NEXT: ; def v[2:3] 7154; GFX90A-NEXT: ;;#ASMEND 7155; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 7156; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 7157; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 7158; GFX90A-NEXT: s_waitcnt vmcnt(0) 7159; GFX90A-NEXT: s_setpc_b64 s[30:31] 7160; 7161; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5: 7162; GFX940: ; %bb.0: 7163; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7164; GFX940-NEXT: ;;#ASMSTART 7165; GFX940-NEXT: ; def v[0:1] 7166; GFX940-NEXT: ;;#ASMEND 7167; GFX940-NEXT: s_mov_b32 s2, 0x5040100 7168; GFX940-NEXT: v_mov_b32_e32 v4, 0 7169; GFX940-NEXT: ;;#ASMSTART 7170; GFX940-NEXT: ; def v[2:3] 7171; GFX940-NEXT: ;;#ASMEND 7172; GFX940-NEXT: s_nop 0 7173; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 7174; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 7175; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 7176; GFX940-NEXT: s_waitcnt vmcnt(0) 7177; GFX940-NEXT: s_setpc_b64 s[30:31] 7178 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 7179 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 7180 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7181 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7182 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 5> 7183 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 7184 ret void 7185} 7186 7187define void @v_shuffle_v4bf16_v3bf16__5_5_3_5(ptr addrspace(1) inreg %ptr) { 7188; GFX900-LABEL: 
v_shuffle_v4bf16_v3bf16__5_5_3_5: 7189; GFX900: ; %bb.0: 7190; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7191; GFX900-NEXT: ;;#ASMSTART 7192; GFX900-NEXT: ; def v[0:1] 7193; GFX900-NEXT: ;;#ASMEND 7194; GFX900-NEXT: s_mov_b32 s4, 0x5040100 7195; GFX900-NEXT: v_mov_b32_e32 v3, 0 7196; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 7197; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 7198; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 7199; GFX900-NEXT: s_waitcnt vmcnt(0) 7200; GFX900-NEXT: s_setpc_b64 s[30:31] 7201; 7202; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5: 7203; GFX90A: ; %bb.0: 7204; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7205; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 7206; GFX90A-NEXT: v_mov_b32_e32 v4, 0 7207; GFX90A-NEXT: ;;#ASMSTART 7208; GFX90A-NEXT: ; def v[0:1] 7209; GFX90A-NEXT: ;;#ASMEND 7210; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 7211; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 7212; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 7213; GFX90A-NEXT: s_waitcnt vmcnt(0) 7214; GFX90A-NEXT: s_setpc_b64 s[30:31] 7215; 7216; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5: 7217; GFX940: ; %bb.0: 7218; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7219; GFX940-NEXT: s_mov_b32 s2, 0x5040100 7220; GFX940-NEXT: v_mov_b32_e32 v4, 0 7221; GFX940-NEXT: ;;#ASMSTART 7222; GFX940-NEXT: ; def v[0:1] 7223; GFX940-NEXT: ;;#ASMEND 7224; GFX940-NEXT: s_nop 0 7225; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 7226; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 7227; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 7228; GFX940-NEXT: s_waitcnt vmcnt(0) 7229; GFX940-NEXT: s_setpc_b64 s[30:31] 7230 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 7231 %vec1 = call <4 x bfloat> asm "; def $0", "=v"() 7232 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7233 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7234 %shuf = shufflevector <3 x bfloat> 
%extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 5> 7235 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 7236 ret void 7237} 7238 7239define void @v_shuffle_v4bf16_v3bf16__5_5_4_5(ptr addrspace(1) inreg %ptr) { 7240; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5: 7241; GFX900: ; %bb.0: 7242; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7243; GFX900-NEXT: ;;#ASMSTART 7244; GFX900-NEXT: ; def v[0:1] 7245; GFX900-NEXT: ;;#ASMEND 7246; GFX900-NEXT: s_mov_b32 s4, 0x5040100 7247; GFX900-NEXT: v_mov_b32_e32 v3, 0 7248; GFX900-NEXT: v_alignbit_b32 v2, v1, v0, 16 7249; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 7250; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 7251; GFX900-NEXT: s_waitcnt vmcnt(0) 7252; GFX900-NEXT: s_setpc_b64 s[30:31] 7253; 7254; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5: 7255; GFX90A: ; %bb.0: 7256; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7257; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 7258; GFX90A-NEXT: v_mov_b32_e32 v4, 0 7259; GFX90A-NEXT: ;;#ASMSTART 7260; GFX90A-NEXT: ; def v[0:1] 7261; GFX90A-NEXT: ;;#ASMEND 7262; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 7263; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 7264; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 7265; GFX90A-NEXT: s_waitcnt vmcnt(0) 7266; GFX90A-NEXT: s_setpc_b64 s[30:31] 7267; 7268; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5: 7269; GFX940: ; %bb.0: 7270; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7271; GFX940-NEXT: s_mov_b32 s2, 0x5040100 7272; GFX940-NEXT: v_mov_b32_e32 v4, 0 7273; GFX940-NEXT: ;;#ASMSTART 7274; GFX940-NEXT: ; def v[0:1] 7275; GFX940-NEXT: ;;#ASMEND 7276; GFX940-NEXT: s_nop 0 7277; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 7278; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 7279; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 7280; GFX940-NEXT: s_waitcnt vmcnt(0) 7281; GFX940-NEXT: s_setpc_b64 s[30:31] 7282 %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 7283 %vec1 = call <4 x 
bfloat> asm "; def $0", "=v"() 7284 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7285 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7286 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 5> 7287 store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 7288 ret void 7289} 7290 7291define void @s_shuffle_v4bf16_v3bf16__u_u_u_u() { 7292; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__u_u_u_u: 7293; GFX9: ; %bb.0: 7294; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7295; GFX9-NEXT: ;;#ASMSTART 7296; GFX9-NEXT: ; use s[8:9] 7297; GFX9-NEXT: ;;#ASMEND 7298; GFX9-NEXT: s_setpc_b64 s[30:31] 7299 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7300 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7301 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> poison 7302 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7303 ret void 7304} 7305 7306define void @s_shuffle_v4bf16_v3bf16__0_u_u_u() { 7307; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u: 7308; GFX900: ; %bb.0: 7309; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7310; GFX900-NEXT: ;;#ASMSTART 7311; GFX900-NEXT: ; def s[8:9] 7312; GFX900-NEXT: ;;#ASMEND 7313; GFX900-NEXT: ;;#ASMSTART 7314; GFX900-NEXT: ; use s[8:9] 7315; GFX900-NEXT: ;;#ASMEND 7316; GFX900-NEXT: s_setpc_b64 s[30:31] 7317; 7318; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u: 7319; GFX90A: ; %bb.0: 7320; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7321; GFX90A-NEXT: ;;#ASMSTART 7322; GFX90A-NEXT: ; def s[8:9] 7323; GFX90A-NEXT: ;;#ASMEND 7324; GFX90A-NEXT: ;;#ASMSTART 7325; GFX90A-NEXT: ; use s[8:9] 7326; GFX90A-NEXT: ;;#ASMEND 7327; GFX90A-NEXT: s_setpc_b64 s[30:31] 7328; 7329; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u: 7330; GFX940: ; %bb.0: 7331; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 7332; GFX940-NEXT: ;;#ASMSTART 7333; GFX940-NEXT: ; def s[8:9] 7334; GFX940-NEXT: ;;#ASMEND 7335; GFX940-NEXT: s_nop 0 7336; GFX940-NEXT: ;;#ASMSTART 7337; GFX940-NEXT: ; use s[8:9] 7338; GFX940-NEXT: ;;#ASMEND 7339; GFX940-NEXT: s_setpc_b64 s[30:31] 7340 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7341 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7342 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison> 7343 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7344 ret void 7345} 7346 7347define void @s_shuffle_v4bf16_v3bf16__1_u_u_u() { 7348; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u: 7349; GFX900: ; %bb.0: 7350; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7351; GFX900-NEXT: ;;#ASMSTART 7352; GFX900-NEXT: ; def s[4:5] 7353; GFX900-NEXT: ;;#ASMEND 7354; GFX900-NEXT: s_lshr_b32 s8, s4, 16 7355; GFX900-NEXT: ;;#ASMSTART 7356; GFX900-NEXT: ; use s[8:9] 7357; GFX900-NEXT: ;;#ASMEND 7358; GFX900-NEXT: s_setpc_b64 s[30:31] 7359; 7360; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u: 7361; GFX90A: ; %bb.0: 7362; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7363; GFX90A-NEXT: ;;#ASMSTART 7364; GFX90A-NEXT: ; def s[4:5] 7365; GFX90A-NEXT: ;;#ASMEND 7366; GFX90A-NEXT: s_lshr_b32 s8, s4, 16 7367; GFX90A-NEXT: ;;#ASMSTART 7368; GFX90A-NEXT: ; use s[8:9] 7369; GFX90A-NEXT: ;;#ASMEND 7370; GFX90A-NEXT: s_setpc_b64 s[30:31] 7371; 7372; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u: 7373; GFX940: ; %bb.0: 7374; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7375; GFX940-NEXT: ;;#ASMSTART 7376; GFX940-NEXT: ; def s[0:1] 7377; GFX940-NEXT: ;;#ASMEND 7378; GFX940-NEXT: s_lshr_b32 s8, s0, 16 7379; GFX940-NEXT: ;;#ASMSTART 7380; GFX940-NEXT: ; use s[8:9] 7381; GFX940-NEXT: ;;#ASMEND 7382; GFX940-NEXT: s_setpc_b64 s[30:31] 7383 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7384 %extract3 = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7385 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison> 7386 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7387 ret void 7388} 7389 7390define void @s_shuffle_v4bf16_v3bf16__2_u_u_u() { 7391; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: 7392; GFX900: ; %bb.0: 7393; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7394; GFX900-NEXT: ;;#ASMSTART 7395; GFX900-NEXT: ; def s[4:5] 7396; GFX900-NEXT: ;;#ASMEND 7397; GFX900-NEXT: s_mov_b32 s8, s5 7398; GFX900-NEXT: ;;#ASMSTART 7399; GFX900-NEXT: ; use s[8:9] 7400; GFX900-NEXT: ;;#ASMEND 7401; GFX900-NEXT: s_setpc_b64 s[30:31] 7402; 7403; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: 7404; GFX90A: ; %bb.0: 7405; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7406; GFX90A-NEXT: ;;#ASMSTART 7407; GFX90A-NEXT: ; def s[4:5] 7408; GFX90A-NEXT: ;;#ASMEND 7409; GFX90A-NEXT: s_mov_b32 s8, s5 7410; GFX90A-NEXT: ;;#ASMSTART 7411; GFX90A-NEXT: ; use s[8:9] 7412; GFX90A-NEXT: ;;#ASMEND 7413; GFX90A-NEXT: s_setpc_b64 s[30:31] 7414; 7415; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: 7416; GFX940: ; %bb.0: 7417; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7418; GFX940-NEXT: ;;#ASMSTART 7419; GFX940-NEXT: ; def s[0:1] 7420; GFX940-NEXT: ;;#ASMEND 7421; GFX940-NEXT: s_mov_b32 s8, s1 7422; GFX940-NEXT: ;;#ASMSTART 7423; GFX940-NEXT: ; use s[8:9] 7424; GFX940-NEXT: ;;#ASMEND 7425; GFX940-NEXT: s_setpc_b64 s[30:31] 7426 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7427 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7428 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison> 7429 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7430 ret void 7431} 7432 7433define void @s_shuffle_v4bf16_v3bf16__3_u_u_u() 
{ 7434; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__3_u_u_u: 7435; GFX9: ; %bb.0: 7436; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7437; GFX9-NEXT: ;;#ASMSTART 7438; GFX9-NEXT: ; use s[8:9] 7439; GFX9-NEXT: ;;#ASMEND 7440; GFX9-NEXT: s_setpc_b64 s[30:31] 7441 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7442 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7443 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison> 7444 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7445 ret void 7446} 7447 7448define void @s_shuffle_v4bf16_v3bf16__4_u_u_u() { 7449; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u: 7450; GFX900: ; %bb.0: 7451; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7452; GFX900-NEXT: ;;#ASMSTART 7453; GFX900-NEXT: ; def s[4:5] 7454; GFX900-NEXT: ;;#ASMEND 7455; GFX900-NEXT: s_lshr_b32 s8, s4, 16 7456; GFX900-NEXT: ;;#ASMSTART 7457; GFX900-NEXT: ; use s[8:9] 7458; GFX900-NEXT: ;;#ASMEND 7459; GFX900-NEXT: s_setpc_b64 s[30:31] 7460; 7461; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u: 7462; GFX90A: ; %bb.0: 7463; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7464; GFX90A-NEXT: ;;#ASMSTART 7465; GFX90A-NEXT: ; def s[4:5] 7466; GFX90A-NEXT: ;;#ASMEND 7467; GFX90A-NEXT: s_lshr_b32 s8, s4, 16 7468; GFX90A-NEXT: ;;#ASMSTART 7469; GFX90A-NEXT: ; use s[8:9] 7470; GFX90A-NEXT: ;;#ASMEND 7471; GFX90A-NEXT: s_setpc_b64 s[30:31] 7472; 7473; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u: 7474; GFX940: ; %bb.0: 7475; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7476; GFX940-NEXT: ;;#ASMSTART 7477; GFX940-NEXT: ; def s[0:1] 7478; GFX940-NEXT: ;;#ASMEND 7479; GFX940-NEXT: s_lshr_b32 s8, s0, 16 7480; GFX940-NEXT: ;;#ASMSTART 7481; GFX940-NEXT: ; use s[8:9] 7482; GFX940-NEXT: ;;#ASMEND 7483; GFX940-NEXT: s_setpc_b64 s[30:31] 7484 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7485 %vec1 = call <4 x bfloat> asm "; 
def $0", "=s"() 7486 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7487 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7488 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison> 7489 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7490 ret void 7491} 7492 7493define void @s_shuffle_v4bf16_v3bf16__5_u_u_u() { 7494; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: 7495; GFX900: ; %bb.0: 7496; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7497; GFX900-NEXT: ;;#ASMSTART 7498; GFX900-NEXT: ; def s[4:5] 7499; GFX900-NEXT: ;;#ASMEND 7500; GFX900-NEXT: s_mov_b32 s8, s5 7501; GFX900-NEXT: ;;#ASMSTART 7502; GFX900-NEXT: ; use s[8:9] 7503; GFX900-NEXT: ;;#ASMEND 7504; GFX900-NEXT: s_setpc_b64 s[30:31] 7505; 7506; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: 7507; GFX90A: ; %bb.0: 7508; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7509; GFX90A-NEXT: ;;#ASMSTART 7510; GFX90A-NEXT: ; def s[4:5] 7511; GFX90A-NEXT: ;;#ASMEND 7512; GFX90A-NEXT: s_mov_b32 s8, s5 7513; GFX90A-NEXT: ;;#ASMSTART 7514; GFX90A-NEXT: ; use s[8:9] 7515; GFX90A-NEXT: ;;#ASMEND 7516; GFX90A-NEXT: s_setpc_b64 s[30:31] 7517; 7518; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: 7519; GFX940: ; %bb.0: 7520; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7521; GFX940-NEXT: ;;#ASMSTART 7522; GFX940-NEXT: ; def s[0:1] 7523; GFX940-NEXT: ;;#ASMEND 7524; GFX940-NEXT: s_mov_b32 s8, s1 7525; GFX940-NEXT: ;;#ASMSTART 7526; GFX940-NEXT: ; use s[8:9] 7527; GFX940-NEXT: ;;#ASMEND 7528; GFX940-NEXT: s_setpc_b64 s[30:31] 7529 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7530 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7531 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7532 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x 
i32> <i32 0, i32 1, i32 2> 7533 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison> 7534 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7535 ret void 7536} 7537 7538define void @s_shuffle_v4bf16_v3bf16__5_0_u_u() { 7539; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u: 7540; GFX900: ; %bb.0: 7541; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7542; GFX900-NEXT: ;;#ASMSTART 7543; GFX900-NEXT: ; def s[4:5] 7544; GFX900-NEXT: ;;#ASMEND 7545; GFX900-NEXT: ;;#ASMSTART 7546; GFX900-NEXT: ; def s[6:7] 7547; GFX900-NEXT: ;;#ASMEND 7548; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 7549; GFX900-NEXT: ;;#ASMSTART 7550; GFX900-NEXT: ; use s[8:9] 7551; GFX900-NEXT: ;;#ASMEND 7552; GFX900-NEXT: s_setpc_b64 s[30:31] 7553; 7554; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u: 7555; GFX90A: ; %bb.0: 7556; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7557; GFX90A-NEXT: ;;#ASMSTART 7558; GFX90A-NEXT: ; def s[4:5] 7559; GFX90A-NEXT: ;;#ASMEND 7560; GFX90A-NEXT: ;;#ASMSTART 7561; GFX90A-NEXT: ; def s[6:7] 7562; GFX90A-NEXT: ;;#ASMEND 7563; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 7564; GFX90A-NEXT: ;;#ASMSTART 7565; GFX90A-NEXT: ; use s[8:9] 7566; GFX90A-NEXT: ;;#ASMEND 7567; GFX90A-NEXT: s_setpc_b64 s[30:31] 7568; 7569; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u: 7570; GFX940: ; %bb.0: 7571; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7572; GFX940-NEXT: ;;#ASMSTART 7573; GFX940-NEXT: ; def s[0:1] 7574; GFX940-NEXT: ;;#ASMEND 7575; GFX940-NEXT: ;;#ASMSTART 7576; GFX940-NEXT: ; def s[2:3] 7577; GFX940-NEXT: ;;#ASMEND 7578; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 7579; GFX940-NEXT: ;;#ASMSTART 7580; GFX940-NEXT: ; use s[8:9] 7581; GFX940-NEXT: ;;#ASMEND 7582; GFX940-NEXT: s_setpc_b64 s[30:31] 7583 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7584 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7585 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> <i32 0, i32 1, i32 2> 7586 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7587 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 poison, i32 poison> 7588 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7589 ret void 7590} 7591 7592define void @s_shuffle_v4bf16_v3bf16__5_1_u_u() { 7593; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u: 7594; GFX900: ; %bb.0: 7595; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7596; GFX900-NEXT: ;;#ASMSTART 7597; GFX900-NEXT: ; def s[4:5] 7598; GFX900-NEXT: ;;#ASMEND 7599; GFX900-NEXT: s_lshr_b32 s4, s4, 16 7600; GFX900-NEXT: ;;#ASMSTART 7601; GFX900-NEXT: ; def s[6:7] 7602; GFX900-NEXT: ;;#ASMEND 7603; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 7604; GFX900-NEXT: ;;#ASMSTART 7605; GFX900-NEXT: ; use s[8:9] 7606; GFX900-NEXT: ;;#ASMEND 7607; GFX900-NEXT: s_setpc_b64 s[30:31] 7608; 7609; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u: 7610; GFX90A: ; %bb.0: 7611; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7612; GFX90A-NEXT: ;;#ASMSTART 7613; GFX90A-NEXT: ; def s[4:5] 7614; GFX90A-NEXT: ;;#ASMEND 7615; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 7616; GFX90A-NEXT: ;;#ASMSTART 7617; GFX90A-NEXT: ; def s[6:7] 7618; GFX90A-NEXT: ;;#ASMEND 7619; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 7620; GFX90A-NEXT: ;;#ASMSTART 7621; GFX90A-NEXT: ; use s[8:9] 7622; GFX90A-NEXT: ;;#ASMEND 7623; GFX90A-NEXT: s_setpc_b64 s[30:31] 7624; 7625; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u: 7626; GFX940: ; %bb.0: 7627; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7628; GFX940-NEXT: ;;#ASMSTART 7629; GFX940-NEXT: ; def s[0:1] 7630; GFX940-NEXT: ;;#ASMEND 7631; GFX940-NEXT: s_lshr_b32 s0, s0, 16 7632; GFX940-NEXT: ;;#ASMSTART 7633; GFX940-NEXT: ; def s[2:3] 7634; GFX940-NEXT: ;;#ASMEND 7635; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 7636; GFX940-NEXT: ;;#ASMSTART 7637; GFX940-NEXT: ; use s[8:9] 7638; 
GFX940-NEXT: ;;#ASMEND 7639; GFX940-NEXT: s_setpc_b64 s[30:31] 7640 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7641 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7642 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7643 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7644 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 poison, i32 poison> 7645 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7646 ret void 7647} 7648 7649define void @s_shuffle_v4bf16_v3bf16__5_2_u_u() { 7650; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u: 7651; GFX900: ; %bb.0: 7652; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7653; GFX900-NEXT: ;;#ASMSTART 7654; GFX900-NEXT: ; def s[4:5] 7655; GFX900-NEXT: ;;#ASMEND 7656; GFX900-NEXT: ;;#ASMSTART 7657; GFX900-NEXT: ; def s[6:7] 7658; GFX900-NEXT: ;;#ASMEND 7659; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 7660; GFX900-NEXT: ;;#ASMSTART 7661; GFX900-NEXT: ; use s[8:9] 7662; GFX900-NEXT: ;;#ASMEND 7663; GFX900-NEXT: s_setpc_b64 s[30:31] 7664; 7665; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u: 7666; GFX90A: ; %bb.0: 7667; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7668; GFX90A-NEXT: ;;#ASMSTART 7669; GFX90A-NEXT: ; def s[4:5] 7670; GFX90A-NEXT: ;;#ASMEND 7671; GFX90A-NEXT: ;;#ASMSTART 7672; GFX90A-NEXT: ; def s[6:7] 7673; GFX90A-NEXT: ;;#ASMEND 7674; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 7675; GFX90A-NEXT: ;;#ASMSTART 7676; GFX90A-NEXT: ; use s[8:9] 7677; GFX90A-NEXT: ;;#ASMEND 7678; GFX90A-NEXT: s_setpc_b64 s[30:31] 7679; 7680; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u: 7681; GFX940: ; %bb.0: 7682; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7683; GFX940-NEXT: ;;#ASMSTART 7684; GFX940-NEXT: ; def s[0:1] 7685; GFX940-NEXT: ;;#ASMEND 7686; GFX940-NEXT: ;;#ASMSTART 7687; GFX940-NEXT: ; def s[2:3] 7688; GFX940-NEXT: ;;#ASMEND 7689; 
GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 7690; GFX940-NEXT: ;;#ASMSTART 7691; GFX940-NEXT: ; use s[8:9] 7692; GFX940-NEXT: ;;#ASMEND 7693; GFX940-NEXT: s_setpc_b64 s[30:31] 7694 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7695 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7696 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7697 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7698 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 poison, i32 poison> 7699 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7700 ret void 7701} 7702 7703define void @s_shuffle_v4bf16_v3bf16__5_3_u_u() { 7704; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u: 7705; GFX900: ; %bb.0: 7706; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7707; GFX900-NEXT: ;;#ASMSTART 7708; GFX900-NEXT: ; def s[4:5] 7709; GFX900-NEXT: ;;#ASMEND 7710; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 7711; GFX900-NEXT: ;;#ASMSTART 7712; GFX900-NEXT: ; use s[8:9] 7713; GFX900-NEXT: ;;#ASMEND 7714; GFX900-NEXT: s_setpc_b64 s[30:31] 7715; 7716; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u: 7717; GFX90A: ; %bb.0: 7718; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7719; GFX90A-NEXT: ;;#ASMSTART 7720; GFX90A-NEXT: ; def s[4:5] 7721; GFX90A-NEXT: ;;#ASMEND 7722; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 7723; GFX90A-NEXT: ;;#ASMSTART 7724; GFX90A-NEXT: ; use s[8:9] 7725; GFX90A-NEXT: ;;#ASMEND 7726; GFX90A-NEXT: s_setpc_b64 s[30:31] 7727; 7728; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u: 7729; GFX940: ; %bb.0: 7730; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7731; GFX940-NEXT: ;;#ASMSTART 7732; GFX940-NEXT: ; def s[0:1] 7733; GFX940-NEXT: ;;#ASMEND 7734; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 7735; GFX940-NEXT: ;;#ASMSTART 7736; GFX940-NEXT: ; use s[8:9] 7737; GFX940-NEXT: ;;#ASMEND 7738; GFX940-NEXT: s_setpc_b64 
s[30:31] 7739 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7740 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7741 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7742 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7743 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 poison, i32 poison> 7744 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7745 ret void 7746} 7747 7748define void @s_shuffle_v4bf16_v3bf16__5_4_u_u() { 7749; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u: 7750; GFX900: ; %bb.0: 7751; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7752; GFX900-NEXT: ;;#ASMSTART 7753; GFX900-NEXT: ; def s[4:5] 7754; GFX900-NEXT: ;;#ASMEND 7755; GFX900-NEXT: s_lshr_b32 s4, s4, 16 7756; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 7757; GFX900-NEXT: ;;#ASMSTART 7758; GFX900-NEXT: ; use s[8:9] 7759; GFX900-NEXT: ;;#ASMEND 7760; GFX900-NEXT: s_setpc_b64 s[30:31] 7761; 7762; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u: 7763; GFX90A: ; %bb.0: 7764; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7765; GFX90A-NEXT: ;;#ASMSTART 7766; GFX90A-NEXT: ; def s[4:5] 7767; GFX90A-NEXT: ;;#ASMEND 7768; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 7769; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 7770; GFX90A-NEXT: ;;#ASMSTART 7771; GFX90A-NEXT: ; use s[8:9] 7772; GFX90A-NEXT: ;;#ASMEND 7773; GFX90A-NEXT: s_setpc_b64 s[30:31] 7774; 7775; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u: 7776; GFX940: ; %bb.0: 7777; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7778; GFX940-NEXT: ;;#ASMSTART 7779; GFX940-NEXT: ; def s[0:1] 7780; GFX940-NEXT: ;;#ASMEND 7781; GFX940-NEXT: s_lshr_b32 s0, s0, 16 7782; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 7783; GFX940-NEXT: ;;#ASMSTART 7784; GFX940-NEXT: ; use s[8:9] 7785; GFX940-NEXT: ;;#ASMEND 7786; GFX940-NEXT: s_setpc_b64 s[30:31] 7787 %vec0 = call <4 x bfloat> asm "; 
def $0", "=s"() 7788 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7789 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7790 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7791 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison> 7792 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7793 ret void 7794} 7795 7796define void @s_shuffle_v4bf16_v3bf16__5_5_u_u() { 7797; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u: 7798; GFX900: ; %bb.0: 7799; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7800; GFX900-NEXT: ;;#ASMSTART 7801; GFX900-NEXT: ; def s[4:5] 7802; GFX900-NEXT: ;;#ASMEND 7803; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 7804; GFX900-NEXT: ;;#ASMSTART 7805; GFX900-NEXT: ; use s[8:9] 7806; GFX900-NEXT: ;;#ASMEND 7807; GFX900-NEXT: s_setpc_b64 s[30:31] 7808; 7809; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u: 7810; GFX90A: ; %bb.0: 7811; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7812; GFX90A-NEXT: ;;#ASMSTART 7813; GFX90A-NEXT: ; def s[4:5] 7814; GFX90A-NEXT: ;;#ASMEND 7815; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 7816; GFX90A-NEXT: ;;#ASMSTART 7817; GFX90A-NEXT: ; use s[8:9] 7818; GFX90A-NEXT: ;;#ASMEND 7819; GFX90A-NEXT: s_setpc_b64 s[30:31] 7820; 7821; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u: 7822; GFX940: ; %bb.0: 7823; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7824; GFX940-NEXT: ;;#ASMSTART 7825; GFX940-NEXT: ; def s[0:1] 7826; GFX940-NEXT: ;;#ASMEND 7827; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 7828; GFX940-NEXT: ;;#ASMSTART 7829; GFX940-NEXT: ; use s[8:9] 7830; GFX940-NEXT: ;;#ASMEND 7831; GFX940-NEXT: s_setpc_b64 s[30:31] 7832 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7833 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7834 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, 
i32 2> 7835 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7836 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison> 7837 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7838 ret void 7839} 7840 7841define void @s_shuffle_v4bf16_v3bf16__5_5_0_u() { 7842; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u: 7843; GFX900: ; %bb.0: 7844; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7845; GFX900-NEXT: ;;#ASMSTART 7846; GFX900-NEXT: ; def s[4:5] 7847; GFX900-NEXT: ;;#ASMEND 7848; GFX900-NEXT: ;;#ASMSTART 7849; GFX900-NEXT: ; def s[6:7] 7850; GFX900-NEXT: ;;#ASMEND 7851; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 7852; GFX900-NEXT: s_mov_b32 s9, s4 7853; GFX900-NEXT: ;;#ASMSTART 7854; GFX900-NEXT: ; use s[8:9] 7855; GFX900-NEXT: ;;#ASMEND 7856; GFX900-NEXT: s_setpc_b64 s[30:31] 7857; 7858; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u: 7859; GFX90A: ; %bb.0: 7860; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7861; GFX90A-NEXT: ;;#ASMSTART 7862; GFX90A-NEXT: ; def s[4:5] 7863; GFX90A-NEXT: ;;#ASMEND 7864; GFX90A-NEXT: ;;#ASMSTART 7865; GFX90A-NEXT: ; def s[6:7] 7866; GFX90A-NEXT: ;;#ASMEND 7867; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 7868; GFX90A-NEXT: s_mov_b32 s9, s4 7869; GFX90A-NEXT: ;;#ASMSTART 7870; GFX90A-NEXT: ; use s[8:9] 7871; GFX90A-NEXT: ;;#ASMEND 7872; GFX90A-NEXT: s_setpc_b64 s[30:31] 7873; 7874; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u: 7875; GFX940: ; %bb.0: 7876; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7877; GFX940-NEXT: ;;#ASMSTART 7878; GFX940-NEXT: ; def s[0:1] 7879; GFX940-NEXT: ;;#ASMEND 7880; GFX940-NEXT: ;;#ASMSTART 7881; GFX940-NEXT: ; def s[2:3] 7882; GFX940-NEXT: ;;#ASMEND 7883; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 7884; GFX940-NEXT: s_mov_b32 s9, s0 7885; GFX940-NEXT: ;;#ASMSTART 7886; GFX940-NEXT: ; use s[8:9] 7887; GFX940-NEXT: ;;#ASMEND 7888; GFX940-NEXT: s_setpc_b64 
s[30:31] 7889 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7890 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7891 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7892 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7893 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 poison> 7894 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7895 ret void 7896} 7897 7898define void @s_shuffle_v4bf16_v3bf16__5_5_1_u() { 7899; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u: 7900; GFX900: ; %bb.0: 7901; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7902; GFX900-NEXT: ;;#ASMSTART 7903; GFX900-NEXT: ; def s[4:5] 7904; GFX900-NEXT: ;;#ASMEND 7905; GFX900-NEXT: ;;#ASMSTART 7906; GFX900-NEXT: ; def s[6:7] 7907; GFX900-NEXT: ;;#ASMEND 7908; GFX900-NEXT: s_lshr_b32 s9, s4, 16 7909; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 7910; GFX900-NEXT: ;;#ASMSTART 7911; GFX900-NEXT: ; use s[8:9] 7912; GFX900-NEXT: ;;#ASMEND 7913; GFX900-NEXT: s_setpc_b64 s[30:31] 7914; 7915; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u: 7916; GFX90A: ; %bb.0: 7917; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7918; GFX90A-NEXT: ;;#ASMSTART 7919; GFX90A-NEXT: ; def s[4:5] 7920; GFX90A-NEXT: ;;#ASMEND 7921; GFX90A-NEXT: ;;#ASMSTART 7922; GFX90A-NEXT: ; def s[6:7] 7923; GFX90A-NEXT: ;;#ASMEND 7924; GFX90A-NEXT: s_lshr_b32 s9, s4, 16 7925; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 7926; GFX90A-NEXT: ;;#ASMSTART 7927; GFX90A-NEXT: ; use s[8:9] 7928; GFX90A-NEXT: ;;#ASMEND 7929; GFX90A-NEXT: s_setpc_b64 s[30:31] 7930; 7931; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u: 7932; GFX940: ; %bb.0: 7933; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7934; GFX940-NEXT: ;;#ASMSTART 7935; GFX940-NEXT: ; def s[0:1] 7936; GFX940-NEXT: ;;#ASMEND 7937; GFX940-NEXT: ;;#ASMSTART 7938; GFX940-NEXT: ; def s[2:3] 7939; 
GFX940-NEXT: ;;#ASMEND 7940; GFX940-NEXT: s_lshr_b32 s9, s0, 16 7941; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 7942; GFX940-NEXT: ;;#ASMSTART 7943; GFX940-NEXT: ; use s[8:9] 7944; GFX940-NEXT: ;;#ASMEND 7945; GFX940-NEXT: s_setpc_b64 s[30:31] 7946 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 7947 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 7948 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7949 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 7950 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 poison> 7951 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 7952 ret void 7953} 7954 7955define void @s_shuffle_v4bf16_v3bf16__5_5_2_u() { 7956; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u: 7957; GFX900: ; %bb.0: 7958; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7959; GFX900-NEXT: ;;#ASMSTART 7960; GFX900-NEXT: ; def s[8:9] 7961; GFX900-NEXT: ;;#ASMEND 7962; GFX900-NEXT: ;;#ASMSTART 7963; GFX900-NEXT: ; def s[4:5] 7964; GFX900-NEXT: ;;#ASMEND 7965; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 7966; GFX900-NEXT: ;;#ASMSTART 7967; GFX900-NEXT: ; use s[8:9] 7968; GFX900-NEXT: ;;#ASMEND 7969; GFX900-NEXT: s_setpc_b64 s[30:31] 7970; 7971; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u: 7972; GFX90A: ; %bb.0: 7973; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7974; GFX90A-NEXT: ;;#ASMSTART 7975; GFX90A-NEXT: ; def s[8:9] 7976; GFX90A-NEXT: ;;#ASMEND 7977; GFX90A-NEXT: ;;#ASMSTART 7978; GFX90A-NEXT: ; def s[4:5] 7979; GFX90A-NEXT: ;;#ASMEND 7980; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 7981; GFX90A-NEXT: ;;#ASMSTART 7982; GFX90A-NEXT: ; use s[8:9] 7983; GFX90A-NEXT: ;;#ASMEND 7984; GFX90A-NEXT: s_setpc_b64 s[30:31] 7985; 7986; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u: 7987; GFX940: ; %bb.0: 7988; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7989; GFX940-NEXT: 
;;#ASMSTART 7990; GFX940-NEXT: ; def s[8:9] 7991; GFX940-NEXT: ;;#ASMEND 7992; GFX940-NEXT: ;;#ASMSTART 7993; GFX940-NEXT: ; def s[0:1] 7994; GFX940-NEXT: ;;#ASMEND 7995; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 7996; GFX940-NEXT: ;;#ASMSTART 7997; GFX940-NEXT: ; use s[8:9] 7998; GFX940-NEXT: ;;#ASMEND 7999; GFX940-NEXT: s_setpc_b64 s[30:31] 8000 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8001 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8002 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8003 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8004 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 poison> 8005 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8006 ret void 8007} 8008 8009define void @s_shuffle_v4bf16_v3bf16__5_5_3_u() { 8010; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u: 8011; GFX900: ; %bb.0: 8012; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8013; GFX900-NEXT: ;;#ASMSTART 8014; GFX900-NEXT: ; def s[4:5] 8015; GFX900-NEXT: ;;#ASMEND 8016; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8017; GFX900-NEXT: s_mov_b32 s9, s4 8018; GFX900-NEXT: ;;#ASMSTART 8019; GFX900-NEXT: ; use s[8:9] 8020; GFX900-NEXT: ;;#ASMEND 8021; GFX900-NEXT: s_setpc_b64 s[30:31] 8022; 8023; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u: 8024; GFX90A: ; %bb.0: 8025; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8026; GFX90A-NEXT: ;;#ASMSTART 8027; GFX90A-NEXT: ; def s[4:5] 8028; GFX90A-NEXT: ;;#ASMEND 8029; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8030; GFX90A-NEXT: s_mov_b32 s9, s4 8031; GFX90A-NEXT: ;;#ASMSTART 8032; GFX90A-NEXT: ; use s[8:9] 8033; GFX90A-NEXT: ;;#ASMEND 8034; GFX90A-NEXT: s_setpc_b64 s[30:31] 8035; 8036; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u: 8037; GFX940: ; %bb.0: 8038; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8039; GFX940-NEXT: 
;;#ASMSTART 8040; GFX940-NEXT: ; def s[0:1] 8041; GFX940-NEXT: ;;#ASMEND 8042; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 8043; GFX940-NEXT: s_mov_b32 s9, s0 8044; GFX940-NEXT: ;;#ASMSTART 8045; GFX940-NEXT: ; use s[8:9] 8046; GFX940-NEXT: ;;#ASMEND 8047; GFX940-NEXT: s_setpc_b64 s[30:31] 8048 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8049 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8050 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8051 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8052 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 poison> 8053 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8054 ret void 8055} 8056 8057define void @s_shuffle_v4bf16_v3bf16__5_5_4_u() { 8058; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u: 8059; GFX900: ; %bb.0: 8060; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8061; GFX900-NEXT: ;;#ASMSTART 8062; GFX900-NEXT: ; def s[4:5] 8063; GFX900-NEXT: ;;#ASMEND 8064; GFX900-NEXT: s_lshr_b32 s9, s4, 16 8065; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8066; GFX900-NEXT: ;;#ASMSTART 8067; GFX900-NEXT: ; use s[8:9] 8068; GFX900-NEXT: ;;#ASMEND 8069; GFX900-NEXT: s_setpc_b64 s[30:31] 8070; 8071; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u: 8072; GFX90A: ; %bb.0: 8073; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8074; GFX90A-NEXT: ;;#ASMSTART 8075; GFX90A-NEXT: ; def s[4:5] 8076; GFX90A-NEXT: ;;#ASMEND 8077; GFX90A-NEXT: s_lshr_b32 s9, s4, 16 8078; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8079; GFX90A-NEXT: ;;#ASMSTART 8080; GFX90A-NEXT: ; use s[8:9] 8081; GFX90A-NEXT: ;;#ASMEND 8082; GFX90A-NEXT: s_setpc_b64 s[30:31] 8083; 8084; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u: 8085; GFX940: ; %bb.0: 8086; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8087; GFX940-NEXT: ;;#ASMSTART 8088; GFX940-NEXT: ; def s[0:1] 8089; 
GFX940-NEXT: ;;#ASMEND 8090; GFX940-NEXT: s_lshr_b32 s9, s0, 16 8091; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 8092; GFX940-NEXT: ;;#ASMSTART 8093; GFX940-NEXT: ; use s[8:9] 8094; GFX940-NEXT: ;;#ASMEND 8095; GFX940-NEXT: s_setpc_b64 s[30:31] 8096 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8097 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8098 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8099 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8100 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 poison> 8101 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8102 ret void 8103} 8104 8105define void @s_shuffle_v4bf16_v3bf16__5_5_5_u() { 8106; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_u: 8107; GFX9: ; %bb.0: 8108; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8109; GFX9-NEXT: ;;#ASMSTART 8110; GFX9-NEXT: ; def s[8:9] 8111; GFX9-NEXT: ;;#ASMEND 8112; GFX9-NEXT: s_pack_ll_b32_b16 s8, s9, s9 8113; GFX9-NEXT: ;;#ASMSTART 8114; GFX9-NEXT: ; use s[8:9] 8115; GFX9-NEXT: ;;#ASMEND 8116; GFX9-NEXT: s_setpc_b64 s[30:31] 8117 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8118 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8119 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8120 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8121 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 poison> 8122 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8123 ret void 8124} 8125 8126define void @s_shuffle_v4bf16_v3bf16__5_5_5_0() { 8127; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0: 8128; GFX900: ; %bb.0: 8129; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8130; GFX900-NEXT: ;;#ASMSTART 8131; GFX900-NEXT: ; def 
s[4:5] 8132; GFX900-NEXT: ;;#ASMEND 8133; GFX900-NEXT: ;;#ASMSTART 8134; GFX900-NEXT: ; def s[6:7] 8135; GFX900-NEXT: ;;#ASMEND 8136; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s4 8137; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 8138; GFX900-NEXT: ;;#ASMSTART 8139; GFX900-NEXT: ; use s[8:9] 8140; GFX900-NEXT: ;;#ASMEND 8141; GFX900-NEXT: s_setpc_b64 s[30:31] 8142; 8143; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0: 8144; GFX90A: ; %bb.0: 8145; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8146; GFX90A-NEXT: ;;#ASMSTART 8147; GFX90A-NEXT: ; def s[4:5] 8148; GFX90A-NEXT: ;;#ASMEND 8149; GFX90A-NEXT: ;;#ASMSTART 8150; GFX90A-NEXT: ; def s[6:7] 8151; GFX90A-NEXT: ;;#ASMEND 8152; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s4 8153; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 8154; GFX90A-NEXT: ;;#ASMSTART 8155; GFX90A-NEXT: ; use s[8:9] 8156; GFX90A-NEXT: ;;#ASMEND 8157; GFX90A-NEXT: s_setpc_b64 s[30:31] 8158; 8159; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0: 8160; GFX940: ; %bb.0: 8161; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8162; GFX940-NEXT: ;;#ASMSTART 8163; GFX940-NEXT: ; def s[0:1] 8164; GFX940-NEXT: ;;#ASMEND 8165; GFX940-NEXT: ;;#ASMSTART 8166; GFX940-NEXT: ; def s[2:3] 8167; GFX940-NEXT: ;;#ASMEND 8168; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s0 8169; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 8170; GFX940-NEXT: ;;#ASMSTART 8171; GFX940-NEXT: ; use s[8:9] 8172; GFX940-NEXT: ;;#ASMEND 8173; GFX940-NEXT: s_setpc_b64 s[30:31] 8174 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8175 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8176 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8177 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8178 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 0> 8179 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8180 ret void 8181} 8182 
8183define void @s_shuffle_v4bf16_v3bf16__5_5_5_1() { 8184; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1: 8185; GFX900: ; %bb.0: 8186; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8187; GFX900-NEXT: ;;#ASMSTART 8188; GFX900-NEXT: ; def s[4:5] 8189; GFX900-NEXT: ;;#ASMEND 8190; GFX900-NEXT: s_lshr_b32 s4, s4, 16 8191; GFX900-NEXT: ;;#ASMSTART 8192; GFX900-NEXT: ; def s[6:7] 8193; GFX900-NEXT: ;;#ASMEND 8194; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s4 8195; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 8196; GFX900-NEXT: ;;#ASMSTART 8197; GFX900-NEXT: ; use s[8:9] 8198; GFX900-NEXT: ;;#ASMEND 8199; GFX900-NEXT: s_setpc_b64 s[30:31] 8200; 8201; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1: 8202; GFX90A: ; %bb.0: 8203; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8204; GFX90A-NEXT: ;;#ASMSTART 8205; GFX90A-NEXT: ; def s[4:5] 8206; GFX90A-NEXT: ;;#ASMEND 8207; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 8208; GFX90A-NEXT: ;;#ASMSTART 8209; GFX90A-NEXT: ; def s[6:7] 8210; GFX90A-NEXT: ;;#ASMEND 8211; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s4 8212; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 8213; GFX90A-NEXT: ;;#ASMSTART 8214; GFX90A-NEXT: ; use s[8:9] 8215; GFX90A-NEXT: ;;#ASMEND 8216; GFX90A-NEXT: s_setpc_b64 s[30:31] 8217; 8218; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1: 8219; GFX940: ; %bb.0: 8220; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8221; GFX940-NEXT: ;;#ASMSTART 8222; GFX940-NEXT: ; def s[0:1] 8223; GFX940-NEXT: ;;#ASMEND 8224; GFX940-NEXT: s_lshr_b32 s0, s0, 16 8225; GFX940-NEXT: ;;#ASMSTART 8226; GFX940-NEXT: ; def s[2:3] 8227; GFX940-NEXT: ;;#ASMEND 8228; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s0 8229; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 8230; GFX940-NEXT: ;;#ASMSTART 8231; GFX940-NEXT: ; use s[8:9] 8232; GFX940-NEXT: ;;#ASMEND 8233; GFX940-NEXT: s_setpc_b64 s[30:31] 8234 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8235 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8236 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x 
bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8237 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8238 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 1> 8239 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8240 ret void 8241} 8242 8243define void @s_shuffle_v4bf16_v3bf16__5_5_5_2() { 8244; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2: 8245; GFX900: ; %bb.0: 8246; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8247; GFX900-NEXT: ;;#ASMSTART 8248; GFX900-NEXT: ; def s[4:5] 8249; GFX900-NEXT: ;;#ASMEND 8250; GFX900-NEXT: ;;#ASMSTART 8251; GFX900-NEXT: ; def s[6:7] 8252; GFX900-NEXT: ;;#ASMEND 8253; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s5 8254; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 8255; GFX900-NEXT: ;;#ASMSTART 8256; GFX900-NEXT: ; use s[8:9] 8257; GFX900-NEXT: ;;#ASMEND 8258; GFX900-NEXT: s_setpc_b64 s[30:31] 8259; 8260; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2: 8261; GFX90A: ; %bb.0: 8262; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8263; GFX90A-NEXT: ;;#ASMSTART 8264; GFX90A-NEXT: ; def s[4:5] 8265; GFX90A-NEXT: ;;#ASMEND 8266; GFX90A-NEXT: ;;#ASMSTART 8267; GFX90A-NEXT: ; def s[6:7] 8268; GFX90A-NEXT: ;;#ASMEND 8269; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s5 8270; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 8271; GFX90A-NEXT: ;;#ASMSTART 8272; GFX90A-NEXT: ; use s[8:9] 8273; GFX90A-NEXT: ;;#ASMEND 8274; GFX90A-NEXT: s_setpc_b64 s[30:31] 8275; 8276; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2: 8277; GFX940: ; %bb.0: 8278; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8279; GFX940-NEXT: ;;#ASMSTART 8280; GFX940-NEXT: ; def s[0:1] 8281; GFX940-NEXT: ;;#ASMEND 8282; GFX940-NEXT: ;;#ASMSTART 8283; GFX940-NEXT: ; def s[2:3] 8284; GFX940-NEXT: ;;#ASMEND 8285; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s1 8286; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 8287; GFX940-NEXT: ;;#ASMSTART 8288; GFX940-NEXT: ; use 
s[8:9] 8289; GFX940-NEXT: ;;#ASMEND 8290; GFX940-NEXT: s_setpc_b64 s[30:31] 8291 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8292 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8293 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8294 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8295 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 2> 8296 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8297 ret void 8298} 8299 8300define void @s_shuffle_v4bf16_v3bf16__5_5_5_3() { 8301; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3: 8302; GFX900: ; %bb.0: 8303; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8304; GFX900-NEXT: ;;#ASMSTART 8305; GFX900-NEXT: ; def s[4:5] 8306; GFX900-NEXT: ;;#ASMEND 8307; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4 8308; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8309; GFX900-NEXT: ;;#ASMSTART 8310; GFX900-NEXT: ; use s[8:9] 8311; GFX900-NEXT: ;;#ASMEND 8312; GFX900-NEXT: s_setpc_b64 s[30:31] 8313; 8314; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3: 8315; GFX90A: ; %bb.0: 8316; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8317; GFX90A-NEXT: ;;#ASMSTART 8318; GFX90A-NEXT: ; def s[4:5] 8319; GFX90A-NEXT: ;;#ASMEND 8320; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4 8321; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8322; GFX90A-NEXT: ;;#ASMSTART 8323; GFX90A-NEXT: ; use s[8:9] 8324; GFX90A-NEXT: ;;#ASMEND 8325; GFX90A-NEXT: s_setpc_b64 s[30:31] 8326; 8327; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3: 8328; GFX940: ; %bb.0: 8329; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8330; GFX940-NEXT: ;;#ASMSTART 8331; GFX940-NEXT: ; def s[0:1] 8332; GFX940-NEXT: ;;#ASMEND 8333; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0 8334; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 8335; GFX940-NEXT: ;;#ASMSTART 8336; GFX940-NEXT: ; use s[8:9] 8337; GFX940-NEXT: ;;#ASMEND 
8338; GFX940-NEXT: s_setpc_b64 s[30:31] 8339 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8340 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8341 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8342 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8343 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 3> 8344 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8345 ret void 8346} 8347 8348define void @s_shuffle_v4bf16_v3bf16__5_5_5_4() { 8349; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4: 8350; GFX900: ; %bb.0: 8351; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8352; GFX900-NEXT: ;;#ASMSTART 8353; GFX900-NEXT: ; def s[4:5] 8354; GFX900-NEXT: ;;#ASMEND 8355; GFX900-NEXT: s_lshr_b32 s4, s4, 16 8356; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4 8357; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8358; GFX900-NEXT: ;;#ASMSTART 8359; GFX900-NEXT: ; use s[8:9] 8360; GFX900-NEXT: ;;#ASMEND 8361; GFX900-NEXT: s_setpc_b64 s[30:31] 8362; 8363; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4: 8364; GFX90A: ; %bb.0: 8365; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8366; GFX90A-NEXT: ;;#ASMSTART 8367; GFX90A-NEXT: ; def s[4:5] 8368; GFX90A-NEXT: ;;#ASMEND 8369; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 8370; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4 8371; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8372; GFX90A-NEXT: ;;#ASMSTART 8373; GFX90A-NEXT: ; use s[8:9] 8374; GFX90A-NEXT: ;;#ASMEND 8375; GFX90A-NEXT: s_setpc_b64 s[30:31] 8376; 8377; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4: 8378; GFX940: ; %bb.0: 8379; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8380; GFX940-NEXT: ;;#ASMSTART 8381; GFX940-NEXT: ; def s[0:1] 8382; GFX940-NEXT: ;;#ASMEND 8383; GFX940-NEXT: s_lshr_b32 s0, s0, 16 8384; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0 8385; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 
8386; GFX940-NEXT: ;;#ASMSTART 8387; GFX940-NEXT: ; use s[8:9] 8388; GFX940-NEXT: ;;#ASMEND 8389; GFX940-NEXT: s_setpc_b64 s[30:31] 8390 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8391 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8392 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8393 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8394 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 4> 8395 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8396 ret void 8397} 8398 8399define void @s_shuffle_v4bf16_v3bf16__5_5_5_5() { 8400; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5: 8401; GFX900: ; %bb.0: 8402; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8403; GFX900-NEXT: ;;#ASMSTART 8404; GFX900-NEXT: ; def s[4:5] 8405; GFX900-NEXT: ;;#ASMEND 8406; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8407; GFX900-NEXT: s_mov_b32 s9, s8 8408; GFX900-NEXT: ;;#ASMSTART 8409; GFX900-NEXT: ; use s[8:9] 8410; GFX900-NEXT: ;;#ASMEND 8411; GFX900-NEXT: s_setpc_b64 s[30:31] 8412; 8413; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5: 8414; GFX90A: ; %bb.0: 8415; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8416; GFX90A-NEXT: ;;#ASMSTART 8417; GFX90A-NEXT: ; def s[4:5] 8418; GFX90A-NEXT: ;;#ASMEND 8419; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 8420; GFX90A-NEXT: s_mov_b32 s9, s8 8421; GFX90A-NEXT: ;;#ASMSTART 8422; GFX90A-NEXT: ; use s[8:9] 8423; GFX90A-NEXT: ;;#ASMEND 8424; GFX90A-NEXT: s_setpc_b64 s[30:31] 8425; 8426; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5: 8427; GFX940: ; %bb.0: 8428; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8429; GFX940-NEXT: ;;#ASMSTART 8430; GFX940-NEXT: ; def s[0:1] 8431; GFX940-NEXT: ;;#ASMEND 8432; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 8433; GFX940-NEXT: s_mov_b32 s9, s8 8434; GFX940-NEXT: ;;#ASMSTART 8435; GFX940-NEXT: ; use s[8:9] 8436; 
GFX940-NEXT: ;;#ASMEND 8437; GFX940-NEXT: s_setpc_b64 s[30:31] 8438 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8439 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8440 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8441 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8442 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 5> 8443 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8444 ret void 8445} 8446 8447define void @s_shuffle_v4bf16_v3bf16__u_0_0_0() { 8448; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0: 8449; GFX900: ; %bb.0: 8450; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8451; GFX900-NEXT: ;;#ASMSTART 8452; GFX900-NEXT: ; def s[4:5] 8453; GFX900-NEXT: ;;#ASMEND 8454; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8455; GFX900-NEXT: s_lshl_b32 s8, s4, 16 8456; GFX900-NEXT: ;;#ASMSTART 8457; GFX900-NEXT: ; use s[8:9] 8458; GFX900-NEXT: ;;#ASMEND 8459; GFX900-NEXT: s_setpc_b64 s[30:31] 8460; 8461; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0: 8462; GFX90A: ; %bb.0: 8463; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8464; GFX90A-NEXT: ;;#ASMSTART 8465; GFX90A-NEXT: ; def s[4:5] 8466; GFX90A-NEXT: ;;#ASMEND 8467; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8468; GFX90A-NEXT: s_lshl_b32 s8, s4, 16 8469; GFX90A-NEXT: ;;#ASMSTART 8470; GFX90A-NEXT: ; use s[8:9] 8471; GFX90A-NEXT: ;;#ASMEND 8472; GFX90A-NEXT: s_setpc_b64 s[30:31] 8473; 8474; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0: 8475; GFX940: ; %bb.0: 8476; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8477; GFX940-NEXT: ;;#ASMSTART 8478; GFX940-NEXT: ; def s[0:1] 8479; GFX940-NEXT: ;;#ASMEND 8480; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8481; GFX940-NEXT: s_lshl_b32 s8, s0, 16 8482; GFX940-NEXT: ;;#ASMSTART 8483; GFX940-NEXT: ; use s[8:9] 8484; GFX940-NEXT: ;;#ASMEND 8485; GFX940-NEXT: s_setpc_b64 
s[30:31] 8486 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8487 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8488 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0> 8489 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8490 ret void 8491} 8492 8493define void @s_shuffle_v4bf16_v3bf16__0_0_0_0() { 8494; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0: 8495; GFX900: ; %bb.0: 8496; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8497; GFX900-NEXT: ;;#ASMSTART 8498; GFX900-NEXT: ; def s[4:5] 8499; GFX900-NEXT: ;;#ASMEND 8500; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s4 8501; GFX900-NEXT: s_mov_b32 s9, s8 8502; GFX900-NEXT: ;;#ASMSTART 8503; GFX900-NEXT: ; use s[8:9] 8504; GFX900-NEXT: ;;#ASMEND 8505; GFX900-NEXT: s_setpc_b64 s[30:31] 8506; 8507; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0: 8508; GFX90A: ; %bb.0: 8509; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8510; GFX90A-NEXT: ;;#ASMSTART 8511; GFX90A-NEXT: ; def s[4:5] 8512; GFX90A-NEXT: ;;#ASMEND 8513; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s4 8514; GFX90A-NEXT: s_mov_b32 s9, s8 8515; GFX90A-NEXT: ;;#ASMSTART 8516; GFX90A-NEXT: ; use s[8:9] 8517; GFX90A-NEXT: ;;#ASMEND 8518; GFX90A-NEXT: s_setpc_b64 s[30:31] 8519; 8520; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0: 8521; GFX940: ; %bb.0: 8522; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8523; GFX940-NEXT: ;;#ASMSTART 8524; GFX940-NEXT: ; def s[0:1] 8525; GFX940-NEXT: ;;#ASMEND 8526; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s0 8527; GFX940-NEXT: s_mov_b32 s9, s8 8528; GFX940-NEXT: ;;#ASMSTART 8529; GFX940-NEXT: ; use s[8:9] 8530; GFX940-NEXT: ;;#ASMEND 8531; GFX940-NEXT: s_setpc_b64 s[30:31] 8532 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8533 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8534 %shuf = shufflevector <3 x bfloat> %extract3, <3 x 
bfloat> poison, <4 x i32> zeroinitializer 8535 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8536 ret void 8537} 8538 8539define void @s_shuffle_v4bf16_v3bf16__1_0_0_0() { 8540; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0: 8541; GFX900: ; %bb.0: 8542; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8543; GFX900-NEXT: ;;#ASMSTART 8544; GFX900-NEXT: ; def s[4:5] 8545; GFX900-NEXT: ;;#ASMEND 8546; GFX900-NEXT: s_lshr_b32 s5, s4, 16 8547; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 8548; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8549; GFX900-NEXT: ;;#ASMSTART 8550; GFX900-NEXT: ; use s[8:9] 8551; GFX900-NEXT: ;;#ASMEND 8552; GFX900-NEXT: s_setpc_b64 s[30:31] 8553; 8554; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0: 8555; GFX90A: ; %bb.0: 8556; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8557; GFX90A-NEXT: ;;#ASMSTART 8558; GFX90A-NEXT: ; def s[4:5] 8559; GFX90A-NEXT: ;;#ASMEND 8560; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 8561; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 8562; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8563; GFX90A-NEXT: ;;#ASMSTART 8564; GFX90A-NEXT: ; use s[8:9] 8565; GFX90A-NEXT: ;;#ASMEND 8566; GFX90A-NEXT: s_setpc_b64 s[30:31] 8567; 8568; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0: 8569; GFX940: ; %bb.0: 8570; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8571; GFX940-NEXT: ;;#ASMSTART 8572; GFX940-NEXT: ; def s[0:1] 8573; GFX940-NEXT: ;;#ASMEND 8574; GFX940-NEXT: s_lshr_b32 s1, s0, 16 8575; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 8576; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8577; GFX940-NEXT: ;;#ASMSTART 8578; GFX940-NEXT: ; use s[8:9] 8579; GFX940-NEXT: ;;#ASMEND 8580; GFX940-NEXT: s_setpc_b64 s[30:31] 8581 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8582 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8583 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 8584 call void asm sideeffect "; 
use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8585 ret void 8586} 8587 8588define void @s_shuffle_v4bf16_v3bf16__2_0_0_0() { 8589; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0: 8590; GFX900: ; %bb.0: 8591; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8592; GFX900-NEXT: ;;#ASMSTART 8593; GFX900-NEXT: ; def s[4:5] 8594; GFX900-NEXT: ;;#ASMEND 8595; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 8596; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8597; GFX900-NEXT: ;;#ASMSTART 8598; GFX900-NEXT: ; use s[8:9] 8599; GFX900-NEXT: ;;#ASMEND 8600; GFX900-NEXT: s_setpc_b64 s[30:31] 8601; 8602; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0: 8603; GFX90A: ; %bb.0: 8604; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8605; GFX90A-NEXT: ;;#ASMSTART 8606; GFX90A-NEXT: ; def s[4:5] 8607; GFX90A-NEXT: ;;#ASMEND 8608; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 8609; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8610; GFX90A-NEXT: ;;#ASMSTART 8611; GFX90A-NEXT: ; use s[8:9] 8612; GFX90A-NEXT: ;;#ASMEND 8613; GFX90A-NEXT: s_setpc_b64 s[30:31] 8614; 8615; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0: 8616; GFX940: ; %bb.0: 8617; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8618; GFX940-NEXT: ;;#ASMSTART 8619; GFX940-NEXT: ; def s[0:1] 8620; GFX940-NEXT: ;;#ASMEND 8621; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 8622; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8623; GFX940-NEXT: ;;#ASMSTART 8624; GFX940-NEXT: ; use s[8:9] 8625; GFX940-NEXT: ;;#ASMEND 8626; GFX940-NEXT: s_setpc_b64 s[30:31] 8627 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8628 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8629 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 8630 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8631 ret void 8632} 8633 8634define void @s_shuffle_v4bf16_v3bf16__3_0_0_0() { 8635; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0: 8636; GFX900: ; %bb.0: 
8637; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8638; GFX900-NEXT: ;;#ASMSTART 8639; GFX900-NEXT: ; def s[4:5] 8640; GFX900-NEXT: ;;#ASMEND 8641; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8642; GFX900-NEXT: s_lshl_b32 s8, s4, 16 8643; GFX900-NEXT: ;;#ASMSTART 8644; GFX900-NEXT: ; use s[8:9] 8645; GFX900-NEXT: ;;#ASMEND 8646; GFX900-NEXT: s_setpc_b64 s[30:31] 8647; 8648; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0: 8649; GFX90A: ; %bb.0: 8650; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8651; GFX90A-NEXT: ;;#ASMSTART 8652; GFX90A-NEXT: ; def s[4:5] 8653; GFX90A-NEXT: ;;#ASMEND 8654; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8655; GFX90A-NEXT: s_lshl_b32 s8, s4, 16 8656; GFX90A-NEXT: ;;#ASMSTART 8657; GFX90A-NEXT: ; use s[8:9] 8658; GFX90A-NEXT: ;;#ASMEND 8659; GFX90A-NEXT: s_setpc_b64 s[30:31] 8660; 8661; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0: 8662; GFX940: ; %bb.0: 8663; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8664; GFX940-NEXT: ;;#ASMSTART 8665; GFX940-NEXT: ; def s[0:1] 8666; GFX940-NEXT: ;;#ASMEND 8667; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8668; GFX940-NEXT: s_lshl_b32 s8, s0, 16 8669; GFX940-NEXT: ;;#ASMSTART 8670; GFX940-NEXT: ; use s[8:9] 8671; GFX940-NEXT: ;;#ASMEND 8672; GFX940-NEXT: s_setpc_b64 s[30:31] 8673 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8674 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8675 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0> 8676 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8677 ret void 8678} 8679 8680define void @s_shuffle_v4bf16_v3bf16__4_0_0_0() { 8681; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0: 8682; GFX900: ; %bb.0: 8683; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8684; GFX900-NEXT: ;;#ASMSTART 8685; GFX900-NEXT: ; def s[4:5] 8686; GFX900-NEXT: ;;#ASMEND 8687; GFX900-NEXT: ;;#ASMSTART 8688; GFX900-NEXT: ; def s[6:7] 8689; 
GFX900-NEXT: ;;#ASMEND 8690; GFX900-NEXT: s_lshr_b32 s5, s6, 16 8691; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 8692; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8693; GFX900-NEXT: ;;#ASMSTART 8694; GFX900-NEXT: ; use s[8:9] 8695; GFX900-NEXT: ;;#ASMEND 8696; GFX900-NEXT: s_setpc_b64 s[30:31] 8697; 8698; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0: 8699; GFX90A: ; %bb.0: 8700; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8701; GFX90A-NEXT: ;;#ASMSTART 8702; GFX90A-NEXT: ; def s[4:5] 8703; GFX90A-NEXT: ;;#ASMEND 8704; GFX90A-NEXT: ;;#ASMSTART 8705; GFX90A-NEXT: ; def s[6:7] 8706; GFX90A-NEXT: ;;#ASMEND 8707; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 8708; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 8709; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8710; GFX90A-NEXT: ;;#ASMSTART 8711; GFX90A-NEXT: ; use s[8:9] 8712; GFX90A-NEXT: ;;#ASMEND 8713; GFX90A-NEXT: s_setpc_b64 s[30:31] 8714; 8715; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0: 8716; GFX940: ; %bb.0: 8717; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8718; GFX940-NEXT: ;;#ASMSTART 8719; GFX940-NEXT: ; def s[0:1] 8720; GFX940-NEXT: ;;#ASMEND 8721; GFX940-NEXT: ;;#ASMSTART 8722; GFX940-NEXT: ; def s[2:3] 8723; GFX940-NEXT: ;;#ASMEND 8724; GFX940-NEXT: s_lshr_b32 s1, s2, 16 8725; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 8726; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8727; GFX940-NEXT: ;;#ASMSTART 8728; GFX940-NEXT: ; use s[8:9] 8729; GFX940-NEXT: ;;#ASMEND 8730; GFX940-NEXT: s_setpc_b64 s[30:31] 8731 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8732 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8733 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8734 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8735 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 0, i32 0, i32 0> 8736 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8737 ret 
void 8738} 8739 8740define void @s_shuffle_v4bf16_v3bf16__5_0_0_0() { 8741; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0: 8742; GFX900: ; %bb.0: 8743; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8744; GFX900-NEXT: ;;#ASMSTART 8745; GFX900-NEXT: ; def s[4:5] 8746; GFX900-NEXT: ;;#ASMEND 8747; GFX900-NEXT: ;;#ASMSTART 8748; GFX900-NEXT: ; def s[6:7] 8749; GFX900-NEXT: ;;#ASMEND 8750; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 8751; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8752; GFX900-NEXT: ;;#ASMSTART 8753; GFX900-NEXT: ; use s[8:9] 8754; GFX900-NEXT: ;;#ASMEND 8755; GFX900-NEXT: s_setpc_b64 s[30:31] 8756; 8757; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0: 8758; GFX90A: ; %bb.0: 8759; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8760; GFX90A-NEXT: ;;#ASMSTART 8761; GFX90A-NEXT: ; def s[4:5] 8762; GFX90A-NEXT: ;;#ASMEND 8763; GFX90A-NEXT: ;;#ASMSTART 8764; GFX90A-NEXT: ; def s[6:7] 8765; GFX90A-NEXT: ;;#ASMEND 8766; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 8767; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8768; GFX90A-NEXT: ;;#ASMSTART 8769; GFX90A-NEXT: ; use s[8:9] 8770; GFX90A-NEXT: ;;#ASMEND 8771; GFX90A-NEXT: s_setpc_b64 s[30:31] 8772; 8773; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0: 8774; GFX940: ; %bb.0: 8775; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8776; GFX940-NEXT: ;;#ASMSTART 8777; GFX940-NEXT: ; def s[0:1] 8778; GFX940-NEXT: ;;#ASMEND 8779; GFX940-NEXT: ;;#ASMSTART 8780; GFX940-NEXT: ; def s[2:3] 8781; GFX940-NEXT: ;;#ASMEND 8782; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 8783; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8784; GFX940-NEXT: ;;#ASMSTART 8785; GFX940-NEXT: ; use s[8:9] 8786; GFX940-NEXT: ;;#ASMEND 8787; GFX940-NEXT: s_setpc_b64 s[30:31] 8788 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8789 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8790 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8791 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x 
bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8792 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 0, i32 0> 8793 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8794 ret void 8795} 8796 8797define void @s_shuffle_v4bf16_v3bf16__5_u_0_0() { 8798; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0: 8799; GFX900: ; %bb.0: 8800; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8801; GFX900-NEXT: ;;#ASMSTART 8802; GFX900-NEXT: ; def s[4:5] 8803; GFX900-NEXT: ;;#ASMEND 8804; GFX900-NEXT: ;;#ASMSTART 8805; GFX900-NEXT: ; def s[6:7] 8806; GFX900-NEXT: ;;#ASMEND 8807; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8808; GFX900-NEXT: s_mov_b32 s8, s7 8809; GFX900-NEXT: ;;#ASMSTART 8810; GFX900-NEXT: ; use s[8:9] 8811; GFX900-NEXT: ;;#ASMEND 8812; GFX900-NEXT: s_setpc_b64 s[30:31] 8813; 8814; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0: 8815; GFX90A: ; %bb.0: 8816; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8817; GFX90A-NEXT: ;;#ASMSTART 8818; GFX90A-NEXT: ; def s[4:5] 8819; GFX90A-NEXT: ;;#ASMEND 8820; GFX90A-NEXT: ;;#ASMSTART 8821; GFX90A-NEXT: ; def s[6:7] 8822; GFX90A-NEXT: ;;#ASMEND 8823; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8824; GFX90A-NEXT: s_mov_b32 s8, s7 8825; GFX90A-NEXT: ;;#ASMSTART 8826; GFX90A-NEXT: ; use s[8:9] 8827; GFX90A-NEXT: ;;#ASMEND 8828; GFX90A-NEXT: s_setpc_b64 s[30:31] 8829; 8830; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0: 8831; GFX940: ; %bb.0: 8832; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8833; GFX940-NEXT: ;;#ASMSTART 8834; GFX940-NEXT: ; def s[0:1] 8835; GFX940-NEXT: ;;#ASMEND 8836; GFX940-NEXT: ;;#ASMSTART 8837; GFX940-NEXT: ; def s[2:3] 8838; GFX940-NEXT: ;;#ASMEND 8839; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8840; GFX940-NEXT: s_mov_b32 s8, s3 8841; GFX940-NEXT: ;;#ASMSTART 8842; GFX940-NEXT: ; use s[8:9] 8843; GFX940-NEXT: ;;#ASMEND 8844; GFX940-NEXT: s_setpc_b64 s[30:31] 8845 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8846 %vec1 
= call <4 x bfloat> asm "; def $0", "=s"() 8847 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8848 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8849 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 0, i32 0> 8850 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8851 ret void 8852} 8853 8854define void @s_shuffle_v4bf16_v3bf16__5_1_0_0() { 8855; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0: 8856; GFX900: ; %bb.0: 8857; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8858; GFX900-NEXT: ;;#ASMSTART 8859; GFX900-NEXT: ; def s[4:5] 8860; GFX900-NEXT: ;;#ASMEND 8861; GFX900-NEXT: s_lshr_b32 s5, s4, 16 8862; GFX900-NEXT: ;;#ASMSTART 8863; GFX900-NEXT: ; def s[6:7] 8864; GFX900-NEXT: ;;#ASMEND 8865; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 8866; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8867; GFX900-NEXT: ;;#ASMSTART 8868; GFX900-NEXT: ; use s[8:9] 8869; GFX900-NEXT: ;;#ASMEND 8870; GFX900-NEXT: s_setpc_b64 s[30:31] 8871; 8872; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0: 8873; GFX90A: ; %bb.0: 8874; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8875; GFX90A-NEXT: ;;#ASMSTART 8876; GFX90A-NEXT: ; def s[4:5] 8877; GFX90A-NEXT: ;;#ASMEND 8878; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 8879; GFX90A-NEXT: ;;#ASMSTART 8880; GFX90A-NEXT: ; def s[6:7] 8881; GFX90A-NEXT: ;;#ASMEND 8882; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 8883; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 8884; GFX90A-NEXT: ;;#ASMSTART 8885; GFX90A-NEXT: ; use s[8:9] 8886; GFX90A-NEXT: ;;#ASMEND 8887; GFX90A-NEXT: s_setpc_b64 s[30:31] 8888; 8889; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0: 8890; GFX940: ; %bb.0: 8891; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8892; GFX940-NEXT: ;;#ASMSTART 8893; GFX940-NEXT: ; def s[0:1] 8894; GFX940-NEXT: ;;#ASMEND 8895; GFX940-NEXT: s_lshr_b32 s1, s0, 16 8896; 
GFX940-NEXT: ;;#ASMSTART 8897; GFX940-NEXT: ; def s[2:3] 8898; GFX940-NEXT: ;;#ASMEND 8899; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 8900; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 8901; GFX940-NEXT: ;;#ASMSTART 8902; GFX940-NEXT: ; use s[8:9] 8903; GFX940-NEXT: ;;#ASMEND 8904; GFX940-NEXT: s_setpc_b64 s[30:31] 8905 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 8906 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 8907 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8908 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 8909 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 0, i32 0> 8910 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 8911 ret void 8912}

; Two-input shuffle, mask <5, 2, 0, 0>:
;   lane 0 = %src1[2], lane 1 = %src0[2], lanes 2-3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_2_0_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 2, i32 0, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, 3, 0, 0>:
;   lane 0 = %src1[2], lane 1 = %src1[0], lanes 2-3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_3_0_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s6
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s6
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s2
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 3, i32 0, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, 4, 0, 0>:
;   lane 0 = %src1[2], lane 1 = %src1[1], lanes 2-3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_4_0_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s5, s6, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s5, s6, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s1, s2, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 4, i32 0, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, 5, 0, 0>:
;   lanes 0-1 = %src1[2], lanes 2-3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_5_0_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 5, i32 0, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

9144 9145define void
@s_shuffle_v4bf16_v3bf16__5_5_u_0() { 9146; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0: 9147; GFX900: ; %bb.0: 9148; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9149; GFX900-NEXT: ;;#ASMSTART 9150; GFX900-NEXT: ; def s[4:5] 9151; GFX900-NEXT: ;;#ASMEND 9152; GFX900-NEXT: ;;#ASMSTART 9153; GFX900-NEXT: ; def s[6:7] 9154; GFX900-NEXT: ;;#ASMEND 9155; GFX900-NEXT: s_lshl_b32 s9, s4, 16 9156; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 9157; GFX900-NEXT: ;;#ASMSTART 9158; GFX900-NEXT: ; use s[8:9] 9159; GFX900-NEXT: ;;#ASMEND 9160; GFX900-NEXT: s_setpc_b64 s[30:31] 9161; 9162; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0: 9163; GFX90A: ; %bb.0: 9164; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9165; GFX90A-NEXT: ;;#ASMSTART 9166; GFX90A-NEXT: ; def s[4:5] 9167; GFX90A-NEXT: ;;#ASMEND 9168; GFX90A-NEXT: ;;#ASMSTART 9169; GFX90A-NEXT: ; def s[6:7] 9170; GFX90A-NEXT: ;;#ASMEND 9171; GFX90A-NEXT: s_lshl_b32 s9, s4, 16 9172; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 9173; GFX90A-NEXT: ;;#ASMSTART 9174; GFX90A-NEXT: ; use s[8:9] 9175; GFX90A-NEXT: ;;#ASMEND 9176; GFX90A-NEXT: s_setpc_b64 s[30:31] 9177; 9178; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0: 9179; GFX940: ; %bb.0: 9180; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9181; GFX940-NEXT: ;;#ASMSTART 9182; GFX940-NEXT: ; def s[0:1] 9183; GFX940-NEXT: ;;#ASMEND 9184; GFX940-NEXT: ;;#ASMSTART 9185; GFX940-NEXT: ; def s[2:3] 9186; GFX940-NEXT: ;;#ASMEND 9187; GFX940-NEXT: s_lshl_b32 s9, s0, 16 9188; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 9189; GFX940-NEXT: ;;#ASMSTART 9190; GFX940-NEXT: ; use s[8:9] 9191; GFX940-NEXT: ;;#ASMEND 9192; GFX940-NEXT: s_setpc_b64 s[30:31] 9193 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 9194 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 9195 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9196 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9197
%shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 0> 9198 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 9199 ret void 9200}

; Two-input shuffle, mask <5, 5, 1, 0>:
;   lanes 0-1 = %src1[2], lane 2 = %src0[1], lane 3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_5_1_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s5, s4, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s5, s4, 16
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s1, s0, 16
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 5, i32 1, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, 5, 2, 0>:
;   lanes 0-1 = %src1[2], lane 2 = %src0[2], lane 3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_5_2_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 5, i32 2, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, 5, 3, 0>:
;   lanes 0-1 = %src1[2], lane 2 = %src1[0], lane 3 = %src0[0].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_5_3_0() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s6, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 5, i32 3, i32 0>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

9375 9376define void @s_shuffle_v4bf16_v3bf16__5_5_4_0() { 9377; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0: 9378; GFX900: ; %bb.0: 9379; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9380; GFX900-NEXT: ;;#ASMSTART 9381; GFX900-NEXT: ; def s[4:5] 9382; GFX900-NEXT: ;;#ASMEND 9383; GFX900-NEXT: ;;#ASMSTART 9384; GFX900-NEXT: ; def s[6:7] 9385; GFX900-NEXT: ;;#ASMEND 9386; GFX900-NEXT: s_lshr_b32 s5, s6, 16 9387; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4 9388; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 9389; GFX900-NEXT: ;;#ASMSTART 9390; GFX900-NEXT: ; use s[8:9] 9391; GFX900-NEXT: ;;#ASMEND 9392; GFX900-NEXT: s_setpc_b64 s[30:31] 9393; 9394; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0: 9395; GFX90A: ; %bb.0: 9396; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9397; GFX90A-NEXT: ;;#ASMSTART 9398; GFX90A-NEXT: ; def s[4:5] 9399; GFX90A-NEXT: ;;#ASMEND 9400; GFX90A-NEXT: ;;#ASMSTART 9401; GFX90A-NEXT:
; def s[6:7] 9402; GFX90A-NEXT: ;;#ASMEND 9403; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 9404; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4 9405; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 9406; GFX90A-NEXT: ;;#ASMSTART 9407; GFX90A-NEXT: ; use s[8:9] 9408; GFX90A-NEXT: ;;#ASMEND 9409; GFX90A-NEXT: s_setpc_b64 s[30:31] 9410; 9411; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0: 9412; GFX940: ; %bb.0: 9413; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9414; GFX940-NEXT: ;;#ASMSTART 9415; GFX940-NEXT: ; def s[0:1] 9416; GFX940-NEXT: ;;#ASMEND 9417; GFX940-NEXT: ;;#ASMSTART 9418; GFX940-NEXT: ; def s[2:3] 9419; GFX940-NEXT: ;;#ASMEND 9420; GFX940-NEXT: s_lshr_b32 s1, s2, 16 9421; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0 9422; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 9423; GFX940-NEXT: ;;#ASMSTART 9424; GFX940-NEXT: ; use s[8:9] 9425; GFX940-NEXT: ;;#ASMEND 9426; GFX940-NEXT: s_setpc_b64 s[30:31] 9427 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 9428 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 9429 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9430 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9431 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 0> 9432 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 9433 ret void 9434}

; Single-input shuffle, mask <poison, 1, 1, 1>:
;   lane 0 is unconstrained, lanes 1-3 = %src0[1].
; The input comes from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__u_1_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s8, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s8, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s8, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Single-input shuffle, mask <0, 1, 1, 1>:
;   lane 0 = %src0[0], lanes 1-3 = %src0[1].
; The input comes from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__0_1_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s8, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s8, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s8, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Single-input shuffle, mask <1, 1, 1, 1>: every lane = %src0[1] (splat).
; The input comes from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__1_1_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT: s_mov_b32 s9, s8
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT: s_mov_b32 s9, s8
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s0
; GFX940-NEXT: s_mov_b32 s9, s8
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Single-input shuffle, mask <2, 1, 1, 1>:
;   lane 0 = %src0[2], lanes 1-3 = %src0[1].
; The input comes from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__2_1_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

9625 9626define void @s_shuffle_v4bf16_v3bf16__3_1_1_1() { 9627; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1: 9628; GFX900: ; %bb.0: 9629; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9630; GFX900-NEXT: ;;#ASMSTART 9631; GFX900-NEXT: ; def s[8:9] 9632; GFX900-NEXT: ;;#ASMEND 9633; GFX900-NEXT: s_lshr_b32 s4, s8, 16 9634; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9635; GFX900-NEXT: ;;#ASMSTART 9636; GFX900-NEXT: ; use s[8:9] 9637; GFX900-NEXT: ;;#ASMEND 9638; GFX900-NEXT: s_setpc_b64 s[30:31] 9639; 9640; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1: 9641; GFX90A: ; %bb.0: 9642; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9643; GFX90A-NEXT: ;;#ASMSTART 9644; GFX90A-NEXT: ; def s[8:9] 9645; GFX90A-NEXT: ;;#ASMEND 9646; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 9647; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9648; GFX90A-NEXT: ;;#ASMSTART 9649; GFX90A-NEXT: ; use s[8:9] 9650; GFX90A-NEXT: ;;#ASMEND 9651; GFX90A-NEXT: s_setpc_b64 s[30:31] 9652; 9653; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1: 9654; GFX940: ; %bb.0: 9655; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9656; GFX940-NEXT: ;;#ASMSTART 9657; GFX940-NEXT: ; def
s[8:9] 9658; GFX940-NEXT: ;;#ASMEND 9659; GFX940-NEXT: s_lshr_b32 s0, s8, 16 9660; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 9661; GFX940-NEXT: ;;#ASMSTART 9662; GFX940-NEXT: ; use s[8:9] 9663; GFX940-NEXT: ;;#ASMEND 9664; GFX940-NEXT: s_setpc_b64 s[30:31] 9665 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 9666 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9667 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1> 9668 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 9669 ret void 9670}

; Two-input shuffle, mask <4, 1, 1, 1>:
;   lane 0 = %src1[1], lanes 1-3 = %src0[1].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__4_1_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: s_lshr_b32 s5, s6, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: s_lshr_b32 s5, s6, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: s_lshr_b32 s1, s2, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, 1, 1, 1>:
;   lane 0 = %src1[2], lanes 1-3 = %src0[1].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_1_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

; Two-input shuffle, mask <5, poison, 1, 1>:
;   lane 0 = %src1[2], lane 1 is unconstrained, lanes 2-3 = %src0[1].
; Both inputs come from "=s" inline asm; the result is consumed in s[8:9].
define void @s_shuffle_v4bf16_v3bf16__5_u_1_1() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX900-NEXT: s_mov_b32 s8, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4
; GFX90A-NEXT: s_mov_b32 s8, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0
; GFX940-NEXT: s_mov_b32 s8, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %src0 = call <4 x bfloat> asm "; def $0", "=s"()
  %src1 = call <4 x bfloat> asm "; def $0", "=s"()
  %src0.v3 = shufflevector <4 x bfloat> %src0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %src1.v3 = shufflevector <4 x bfloat> %src1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = shufflevector <3 x bfloat> %src0.v3, <3 x bfloat> %src1.v3, <4 x i32> <i32 5, i32 poison, i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %result)
  ret void
}

9854 9855define void @s_shuffle_v4bf16_v3bf16__5_0_1_1() { 9856; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1: 9857; GFX900: ; %bb.0: 9858; GFX900-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9859; GFX900-NEXT: ;;#ASMSTART 9860; GFX900-NEXT: ; def s[4:5] 9861; GFX900-NEXT: ;;#ASMEND 9862; GFX900-NEXT: ;;#ASMSTART 9863; GFX900-NEXT: ; def s[6:7] 9864; GFX900-NEXT: ;;#ASMEND 9865; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 9866; GFX900-NEXT: s_lshr_b32 s4, s4, 16 9867; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9868; GFX900-NEXT: ;;#ASMSTART 9869; GFX900-NEXT: ; use s[8:9] 9870; GFX900-NEXT: ;;#ASMEND 9871; GFX900-NEXT: s_setpc_b64 s[30:31] 9872; 9873; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1: 9874; GFX90A: ; %bb.0: 9875; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9876; GFX90A-NEXT: ;;#ASMSTART 9877; GFX90A-NEXT: ; def s[4:5] 9878; GFX90A-NEXT: ;;#ASMEND 9879; GFX90A-NEXT: ;;#ASMSTART 9880; GFX90A-NEXT: ; def s[6:7] 9881; GFX90A-NEXT: ;;#ASMEND 9882; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 9883; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 9884; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9885; GFX90A-NEXT: ;;#ASMSTART 9886; GFX90A-NEXT: ; use s[8:9] 9887; GFX90A-NEXT: ;;#ASMEND 9888; GFX90A-NEXT: s_setpc_b64 s[30:31] 9889; 9890; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1: 9891; GFX940: ; %bb.0: 9892; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9893; GFX940-NEXT: ;;#ASMSTART 9894; GFX940-NEXT: ; def s[0:1] 9895; GFX940-NEXT: ;;#ASMEND 9896; GFX940-NEXT: ;;#ASMSTART 9897; GFX940-NEXT: ; def s[2:3] 9898; GFX940-NEXT: ;;#ASMEND 9899; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 9900; GFX940-NEXT: s_lshr_b32 s0, s0, 16 9901; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 9902; GFX940-NEXT: ;;#ASMSTART 9903; GFX940-NEXT: ; use s[8:9] 9904; GFX940-NEXT: ;;#ASMEND 9905; GFX940-NEXT: s_setpc_b64 s[30:31] 9906 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 9907 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 9908 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9909 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 
9910 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 1, i32 1> 9911 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 9912 ret void 9913} 9914 9915define void @s_shuffle_v4bf16_v3bf16__5_2_1_1() { 9916; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1: 9917; GFX900: ; %bb.0: 9918; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9919; GFX900-NEXT: ;;#ASMSTART 9920; GFX900-NEXT: ; def s[4:5] 9921; GFX900-NEXT: ;;#ASMEND 9922; GFX900-NEXT: s_lshr_b32 s4, s4, 16 9923; GFX900-NEXT: ;;#ASMSTART 9924; GFX900-NEXT: ; def s[6:7] 9925; GFX900-NEXT: ;;#ASMEND 9926; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 9927; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9928; GFX900-NEXT: ;;#ASMSTART 9929; GFX900-NEXT: ; use s[8:9] 9930; GFX900-NEXT: ;;#ASMEND 9931; GFX900-NEXT: s_setpc_b64 s[30:31] 9932; 9933; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1: 9934; GFX90A: ; %bb.0: 9935; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9936; GFX90A-NEXT: ;;#ASMSTART 9937; GFX90A-NEXT: ; def s[4:5] 9938; GFX90A-NEXT: ;;#ASMEND 9939; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 9940; GFX90A-NEXT: ;;#ASMSTART 9941; GFX90A-NEXT: ; def s[6:7] 9942; GFX90A-NEXT: ;;#ASMEND 9943; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 9944; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9945; GFX90A-NEXT: ;;#ASMSTART 9946; GFX90A-NEXT: ; use s[8:9] 9947; GFX90A-NEXT: ;;#ASMEND 9948; GFX90A-NEXT: s_setpc_b64 s[30:31] 9949; 9950; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1: 9951; GFX940: ; %bb.0: 9952; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9953; GFX940-NEXT: ;;#ASMSTART 9954; GFX940-NEXT: ; def s[0:1] 9955; GFX940-NEXT: ;;#ASMEND 9956; GFX940-NEXT: s_lshr_b32 s0, s0, 16 9957; GFX940-NEXT: ;;#ASMSTART 9958; GFX940-NEXT: ; def s[2:3] 9959; GFX940-NEXT: ;;#ASMEND 9960; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 9961; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 9962; GFX940-NEXT: ;;#ASMSTART 9963; GFX940-NEXT: ; use s[8:9] 9964; GFX940-NEXT: 
;;#ASMEND 9965; GFX940-NEXT: s_setpc_b64 s[30:31] 9966 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 9967 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 9968 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9969 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 9970 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 1, i32 1> 9971 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 9972 ret void 9973} 9974 9975define void @s_shuffle_v4bf16_v3bf16__5_3_1_1() { 9976; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1: 9977; GFX900: ; %bb.0: 9978; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9979; GFX900-NEXT: ;;#ASMSTART 9980; GFX900-NEXT: ; def s[4:5] 9981; GFX900-NEXT: ;;#ASMEND 9982; GFX900-NEXT: s_lshr_b32 s4, s4, 16 9983; GFX900-NEXT: ;;#ASMSTART 9984; GFX900-NEXT: ; def s[6:7] 9985; GFX900-NEXT: ;;#ASMEND 9986; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s6 9987; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 9988; GFX900-NEXT: ;;#ASMSTART 9989; GFX900-NEXT: ; use s[8:9] 9990; GFX900-NEXT: ;;#ASMEND 9991; GFX900-NEXT: s_setpc_b64 s[30:31] 9992; 9993; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1: 9994; GFX90A: ; %bb.0: 9995; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9996; GFX90A-NEXT: ;;#ASMSTART 9997; GFX90A-NEXT: ; def s[4:5] 9998; GFX90A-NEXT: ;;#ASMEND 9999; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10000; GFX90A-NEXT: ;;#ASMSTART 10001; GFX90A-NEXT: ; def s[6:7] 10002; GFX90A-NEXT: ;;#ASMEND 10003; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s6 10004; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 10005; GFX90A-NEXT: ;;#ASMSTART 10006; GFX90A-NEXT: ; use s[8:9] 10007; GFX90A-NEXT: ;;#ASMEND 10008; GFX90A-NEXT: s_setpc_b64 s[30:31] 10009; 10010; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1: 10011; GFX940: ; %bb.0: 10012; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10013; GFX940-NEXT: 
;;#ASMSTART 10014; GFX940-NEXT: ; def s[0:1] 10015; GFX940-NEXT: ;;#ASMEND 10016; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10017; GFX940-NEXT: ;;#ASMSTART 10018; GFX940-NEXT: ; def s[2:3] 10019; GFX940-NEXT: ;;#ASMEND 10020; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s2 10021; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 10022; GFX940-NEXT: ;;#ASMSTART 10023; GFX940-NEXT: ; use s[8:9] 10024; GFX940-NEXT: ;;#ASMEND 10025; GFX940-NEXT: s_setpc_b64 s[30:31] 10026 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10027 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10028 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10029 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10030 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 1, i32 1> 10031 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10032 ret void 10033} 10034 10035define void @s_shuffle_v4bf16_v3bf16__5_4_1_1() { 10036; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1: 10037; GFX900: ; %bb.0: 10038; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10039; GFX900-NEXT: ;;#ASMSTART 10040; GFX900-NEXT: ; def s[4:5] 10041; GFX900-NEXT: ;;#ASMEND 10042; GFX900-NEXT: ;;#ASMSTART 10043; GFX900-NEXT: ; def s[6:7] 10044; GFX900-NEXT: ;;#ASMEND 10045; GFX900-NEXT: s_lshr_b32 s5, s6, 16 10046; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10047; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 10048; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 10049; GFX900-NEXT: ;;#ASMSTART 10050; GFX900-NEXT: ; use s[8:9] 10051; GFX900-NEXT: ;;#ASMEND 10052; GFX900-NEXT: s_setpc_b64 s[30:31] 10053; 10054; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1: 10055; GFX90A: ; %bb.0: 10056; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10057; GFX90A-NEXT: ;;#ASMSTART 10058; GFX90A-NEXT: ; def s[4:5] 10059; GFX90A-NEXT: ;;#ASMEND 10060; GFX90A-NEXT: ;;#ASMSTART 10061; GFX90A-NEXT: ; def s[6:7] 10062; 
GFX90A-NEXT: ;;#ASMEND 10063; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 10064; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10065; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 10066; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 10067; GFX90A-NEXT: ;;#ASMSTART 10068; GFX90A-NEXT: ; use s[8:9] 10069; GFX90A-NEXT: ;;#ASMEND 10070; GFX90A-NEXT: s_setpc_b64 s[30:31] 10071; 10072; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1: 10073; GFX940: ; %bb.0: 10074; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10075; GFX940-NEXT: ;;#ASMSTART 10076; GFX940-NEXT: ; def s[0:1] 10077; GFX940-NEXT: ;;#ASMEND 10078; GFX940-NEXT: ;;#ASMSTART 10079; GFX940-NEXT: ; def s[2:3] 10080; GFX940-NEXT: ;;#ASMEND 10081; GFX940-NEXT: s_lshr_b32 s1, s2, 16 10082; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10083; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 10084; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 10085; GFX940-NEXT: ;;#ASMSTART 10086; GFX940-NEXT: ; use s[8:9] 10087; GFX940-NEXT: ;;#ASMEND 10088; GFX940-NEXT: s_setpc_b64 s[30:31] 10089 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10090 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10091 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10092 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10093 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 1, i32 1> 10094 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10095 ret void 10096} 10097 10098define void @s_shuffle_v4bf16_v3bf16__5_5_1_1() { 10099; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1: 10100; GFX900: ; %bb.0: 10101; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10102; GFX900-NEXT: ;;#ASMSTART 10103; GFX900-NEXT: ; def s[4:5] 10104; GFX900-NEXT: ;;#ASMEND 10105; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10106; GFX900-NEXT: ;;#ASMSTART 10107; GFX900-NEXT: ; def s[6:7] 10108; GFX900-NEXT: ;;#ASMEND 10109; GFX900-NEXT: s_pack_ll_b32_b16 
s9, s4, s4 10110; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10111; GFX900-NEXT: ;;#ASMSTART 10112; GFX900-NEXT: ; use s[8:9] 10113; GFX900-NEXT: ;;#ASMEND 10114; GFX900-NEXT: s_setpc_b64 s[30:31] 10115; 10116; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1: 10117; GFX90A: ; %bb.0: 10118; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10119; GFX90A-NEXT: ;;#ASMSTART 10120; GFX90A-NEXT: ; def s[4:5] 10121; GFX90A-NEXT: ;;#ASMEND 10122; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10123; GFX90A-NEXT: ;;#ASMSTART 10124; GFX90A-NEXT: ; def s[6:7] 10125; GFX90A-NEXT: ;;#ASMEND 10126; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 10127; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10128; GFX90A-NEXT: ;;#ASMSTART 10129; GFX90A-NEXT: ; use s[8:9] 10130; GFX90A-NEXT: ;;#ASMEND 10131; GFX90A-NEXT: s_setpc_b64 s[30:31] 10132; 10133; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1: 10134; GFX940: ; %bb.0: 10135; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10136; GFX940-NEXT: ;;#ASMSTART 10137; GFX940-NEXT: ; def s[0:1] 10138; GFX940-NEXT: ;;#ASMEND 10139; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10140; GFX940-NEXT: ;;#ASMSTART 10141; GFX940-NEXT: ; def s[2:3] 10142; GFX940-NEXT: ;;#ASMEND 10143; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 10144; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 10145; GFX940-NEXT: ;;#ASMSTART 10146; GFX940-NEXT: ; use s[8:9] 10147; GFX940-NEXT: ;;#ASMEND 10148; GFX940-NEXT: s_setpc_b64 s[30:31] 10149 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10150 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10151 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10152 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10153 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 1> 10154 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10155 ret void 10156} 10157 10158define void 
@s_shuffle_v4bf16_v3bf16__5_5_u_1() { 10159; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1: 10160; GFX900: ; %bb.0: 10161; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10162; GFX900-NEXT: ;;#ASMSTART 10163; GFX900-NEXT: ; def s[4:5] 10164; GFX900-NEXT: ;;#ASMEND 10165; GFX900-NEXT: ;;#ASMSTART 10166; GFX900-NEXT: ; def s[6:7] 10167; GFX900-NEXT: ;;#ASMEND 10168; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10169; GFX900-NEXT: s_mov_b32 s9, s4 10170; GFX900-NEXT: ;;#ASMSTART 10171; GFX900-NEXT: ; use s[8:9] 10172; GFX900-NEXT: ;;#ASMEND 10173; GFX900-NEXT: s_setpc_b64 s[30:31] 10174; 10175; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1: 10176; GFX90A: ; %bb.0: 10177; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10178; GFX90A-NEXT: ;;#ASMSTART 10179; GFX90A-NEXT: ; def s[4:5] 10180; GFX90A-NEXT: ;;#ASMEND 10181; GFX90A-NEXT: ;;#ASMSTART 10182; GFX90A-NEXT: ; def s[6:7] 10183; GFX90A-NEXT: ;;#ASMEND 10184; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10185; GFX90A-NEXT: s_mov_b32 s9, s4 10186; GFX90A-NEXT: ;;#ASMSTART 10187; GFX90A-NEXT: ; use s[8:9] 10188; GFX90A-NEXT: ;;#ASMEND 10189; GFX90A-NEXT: s_setpc_b64 s[30:31] 10190; 10191; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1: 10192; GFX940: ; %bb.0: 10193; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10194; GFX940-NEXT: ;;#ASMSTART 10195; GFX940-NEXT: ; def s[0:1] 10196; GFX940-NEXT: ;;#ASMEND 10197; GFX940-NEXT: ;;#ASMSTART 10198; GFX940-NEXT: ; def s[2:3] 10199; GFX940-NEXT: ;;#ASMEND 10200; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 10201; GFX940-NEXT: s_mov_b32 s9, s0 10202; GFX940-NEXT: ;;#ASMSTART 10203; GFX940-NEXT: ; use s[8:9] 10204; GFX940-NEXT: ;;#ASMEND 10205; GFX940-NEXT: s_setpc_b64 s[30:31] 10206 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10207 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10208 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10209 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x 
i32> <i32 0, i32 1, i32 2> 10210 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 1> 10211 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10212 ret void 10213} 10214 10215define void @s_shuffle_v4bf16_v3bf16__5_5_0_1() { 10216; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1: 10217; GFX900: ; %bb.0: 10218; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10219; GFX900-NEXT: ;;#ASMSTART 10220; GFX900-NEXT: ; def s[4:5] 10221; GFX900-NEXT: ;;#ASMEND 10222; GFX900-NEXT: ;;#ASMSTART 10223; GFX900-NEXT: ; def s[6:7] 10224; GFX900-NEXT: ;;#ASMEND 10225; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10226; GFX900-NEXT: s_mov_b32 s9, s4 10227; GFX900-NEXT: ;;#ASMSTART 10228; GFX900-NEXT: ; use s[8:9] 10229; GFX900-NEXT: ;;#ASMEND 10230; GFX900-NEXT: s_setpc_b64 s[30:31] 10231; 10232; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1: 10233; GFX90A: ; %bb.0: 10234; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10235; GFX90A-NEXT: ;;#ASMSTART 10236; GFX90A-NEXT: ; def s[4:5] 10237; GFX90A-NEXT: ;;#ASMEND 10238; GFX90A-NEXT: ;;#ASMSTART 10239; GFX90A-NEXT: ; def s[6:7] 10240; GFX90A-NEXT: ;;#ASMEND 10241; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10242; GFX90A-NEXT: s_mov_b32 s9, s4 10243; GFX90A-NEXT: ;;#ASMSTART 10244; GFX90A-NEXT: ; use s[8:9] 10245; GFX90A-NEXT: ;;#ASMEND 10246; GFX90A-NEXT: s_setpc_b64 s[30:31] 10247; 10248; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1: 10249; GFX940: ; %bb.0: 10250; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10251; GFX940-NEXT: ;;#ASMSTART 10252; GFX940-NEXT: ; def s[0:1] 10253; GFX940-NEXT: ;;#ASMEND 10254; GFX940-NEXT: ;;#ASMSTART 10255; GFX940-NEXT: ; def s[2:3] 10256; GFX940-NEXT: ;;#ASMEND 10257; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 10258; GFX940-NEXT: s_mov_b32 s9, s0 10259; GFX940-NEXT: ;;#ASMSTART 10260; GFX940-NEXT: ; use s[8:9] 10261; GFX940-NEXT: ;;#ASMEND 10262; GFX940-NEXT: s_setpc_b64 s[30:31] 10263 %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() 10264 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10265 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10266 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10267 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 1> 10268 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10269 ret void 10270} 10271 10272define void @s_shuffle_v4bf16_v3bf16__5_5_2_1() { 10273; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1: 10274; GFX900: ; %bb.0: 10275; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10276; GFX900-NEXT: ;;#ASMSTART 10277; GFX900-NEXT: ; def s[4:5] 10278; GFX900-NEXT: ;;#ASMEND 10279; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10280; GFX900-NEXT: ;;#ASMSTART 10281; GFX900-NEXT: ; def s[6:7] 10282; GFX900-NEXT: ;;#ASMEND 10283; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4 10284; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10285; GFX900-NEXT: ;;#ASMSTART 10286; GFX900-NEXT: ; use s[8:9] 10287; GFX900-NEXT: ;;#ASMEND 10288; GFX900-NEXT: s_setpc_b64 s[30:31] 10289; 10290; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1: 10291; GFX90A: ; %bb.0: 10292; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10293; GFX90A-NEXT: ;;#ASMSTART 10294; GFX90A-NEXT: ; def s[4:5] 10295; GFX90A-NEXT: ;;#ASMEND 10296; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10297; GFX90A-NEXT: ;;#ASMSTART 10298; GFX90A-NEXT: ; def s[6:7] 10299; GFX90A-NEXT: ;;#ASMEND 10300; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4 10301; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10302; GFX90A-NEXT: ;;#ASMSTART 10303; GFX90A-NEXT: ; use s[8:9] 10304; GFX90A-NEXT: ;;#ASMEND 10305; GFX90A-NEXT: s_setpc_b64 s[30:31] 10306; 10307; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1: 10308; GFX940: ; %bb.0: 10309; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10310; GFX940-NEXT: ;;#ASMSTART 10311; GFX940-NEXT: ; def s[0:1] 
10312; GFX940-NEXT: ;;#ASMEND 10313; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10314; GFX940-NEXT: ;;#ASMSTART 10315; GFX940-NEXT: ; def s[2:3] 10316; GFX940-NEXT: ;;#ASMEND 10317; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0 10318; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 10319; GFX940-NEXT: ;;#ASMSTART 10320; GFX940-NEXT: ; use s[8:9] 10321; GFX940-NEXT: ;;#ASMEND 10322; GFX940-NEXT: s_setpc_b64 s[30:31] 10323 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10324 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10325 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10326 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10327 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 1> 10328 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10329 ret void 10330} 10331 10332define void @s_shuffle_v4bf16_v3bf16__5_5_3_1() { 10333; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1: 10334; GFX900: ; %bb.0: 10335; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10336; GFX900-NEXT: ;;#ASMSTART 10337; GFX900-NEXT: ; def s[4:5] 10338; GFX900-NEXT: ;;#ASMEND 10339; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10340; GFX900-NEXT: ;;#ASMSTART 10341; GFX900-NEXT: ; def s[6:7] 10342; GFX900-NEXT: ;;#ASMEND 10343; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s4 10344; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10345; GFX900-NEXT: ;;#ASMSTART 10346; GFX900-NEXT: ; use s[8:9] 10347; GFX900-NEXT: ;;#ASMEND 10348; GFX900-NEXT: s_setpc_b64 s[30:31] 10349; 10350; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1: 10351; GFX90A: ; %bb.0: 10352; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10353; GFX90A-NEXT: ;;#ASMSTART 10354; GFX90A-NEXT: ; def s[4:5] 10355; GFX90A-NEXT: ;;#ASMEND 10356; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10357; GFX90A-NEXT: ;;#ASMSTART 10358; GFX90A-NEXT: ; def s[6:7] 10359; GFX90A-NEXT: ;;#ASMEND 10360; GFX90A-NEXT: 
s_pack_ll_b32_b16 s9, s6, s4 10361; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10362; GFX90A-NEXT: ;;#ASMSTART 10363; GFX90A-NEXT: ; use s[8:9] 10364; GFX90A-NEXT: ;;#ASMEND 10365; GFX90A-NEXT: s_setpc_b64 s[30:31] 10366; 10367; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1: 10368; GFX940: ; %bb.0: 10369; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10370; GFX940-NEXT: ;;#ASMSTART 10371; GFX940-NEXT: ; def s[0:1] 10372; GFX940-NEXT: ;;#ASMEND 10373; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10374; GFX940-NEXT: ;;#ASMSTART 10375; GFX940-NEXT: ; def s[2:3] 10376; GFX940-NEXT: ;;#ASMEND 10377; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s0 10378; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 10379; GFX940-NEXT: ;;#ASMSTART 10380; GFX940-NEXT: ; use s[8:9] 10381; GFX940-NEXT: ;;#ASMEND 10382; GFX940-NEXT: s_setpc_b64 s[30:31] 10383 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10384 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10385 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10386 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10387 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 1> 10388 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10389 ret void 10390} 10391 10392define void @s_shuffle_v4bf16_v3bf16__5_5_4_1() { 10393; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1: 10394; GFX900: ; %bb.0: 10395; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10396; GFX900-NEXT: ;;#ASMSTART 10397; GFX900-NEXT: ; def s[4:5] 10398; GFX900-NEXT: ;;#ASMEND 10399; GFX900-NEXT: ;;#ASMSTART 10400; GFX900-NEXT: ; def s[6:7] 10401; GFX900-NEXT: ;;#ASMEND 10402; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10403; GFX900-NEXT: s_lshr_b32 s5, s6, 16 10404; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4 10405; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10406; GFX900-NEXT: ;;#ASMSTART 10407; GFX900-NEXT: ; use s[8:9] 
10408; GFX900-NEXT: ;;#ASMEND 10409; GFX900-NEXT: s_setpc_b64 s[30:31] 10410; 10411; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1: 10412; GFX90A: ; %bb.0: 10413; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10414; GFX90A-NEXT: ;;#ASMSTART 10415; GFX90A-NEXT: ; def s[4:5] 10416; GFX90A-NEXT: ;;#ASMEND 10417; GFX90A-NEXT: ;;#ASMSTART 10418; GFX90A-NEXT: ; def s[6:7] 10419; GFX90A-NEXT: ;;#ASMEND 10420; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10421; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 10422; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4 10423; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 10424; GFX90A-NEXT: ;;#ASMSTART 10425; GFX90A-NEXT: ; use s[8:9] 10426; GFX90A-NEXT: ;;#ASMEND 10427; GFX90A-NEXT: s_setpc_b64 s[30:31] 10428; 10429; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1: 10430; GFX940: ; %bb.0: 10431; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10432; GFX940-NEXT: ;;#ASMSTART 10433; GFX940-NEXT: ; def s[0:1] 10434; GFX940-NEXT: ;;#ASMEND 10435; GFX940-NEXT: ;;#ASMSTART 10436; GFX940-NEXT: ; def s[2:3] 10437; GFX940-NEXT: ;;#ASMEND 10438; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10439; GFX940-NEXT: s_lshr_b32 s1, s2, 16 10440; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0 10441; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 10442; GFX940-NEXT: ;;#ASMSTART 10443; GFX940-NEXT: ; use s[8:9] 10444; GFX940-NEXT: ;;#ASMEND 10445; GFX940-NEXT: s_setpc_b64 s[30:31] 10446 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10447 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10448 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10449 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10450 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 1> 10451 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10452 ret void 10453} 10454 10455define void @s_shuffle_v4bf16_v3bf16__u_2_2_2() { 10456; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__u_2_2_2: 10457; GFX900: ; %bb.0: 10458; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10459; GFX900-NEXT: ;;#ASMSTART 10460; GFX900-NEXT: ; def s[4:5] 10461; GFX900-NEXT: ;;#ASMEND 10462; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10463; GFX900-NEXT: s_lshl_b32 s8, s5, 16 10464; GFX900-NEXT: ;;#ASMSTART 10465; GFX900-NEXT: ; use s[8:9] 10466; GFX900-NEXT: ;;#ASMEND 10467; GFX900-NEXT: s_setpc_b64 s[30:31] 10468; 10469; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2: 10470; GFX90A: ; %bb.0: 10471; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10472; GFX90A-NEXT: ;;#ASMSTART 10473; GFX90A-NEXT: ; def s[4:5] 10474; GFX90A-NEXT: ;;#ASMEND 10475; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10476; GFX90A-NEXT: s_lshl_b32 s8, s5, 16 10477; GFX90A-NEXT: ;;#ASMSTART 10478; GFX90A-NEXT: ; use s[8:9] 10479; GFX90A-NEXT: ;;#ASMEND 10480; GFX90A-NEXT: s_setpc_b64 s[30:31] 10481; 10482; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2: 10483; GFX940: ; %bb.0: 10484; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10485; GFX940-NEXT: ;;#ASMSTART 10486; GFX940-NEXT: ; def s[0:1] 10487; GFX940-NEXT: ;;#ASMEND 10488; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10489; GFX940-NEXT: s_lshl_b32 s8, s1, 16 10490; GFX940-NEXT: ;;#ASMSTART 10491; GFX940-NEXT: ; use s[8:9] 10492; GFX940-NEXT: ;;#ASMEND 10493; GFX940-NEXT: s_setpc_b64 s[30:31] 10494 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10495 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10496 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2> 10497 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10498 ret void 10499} 10500 10501define void @s_shuffle_v4bf16_v3bf16__0_2_2_2() { 10502; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2: 10503; GFX900: ; %bb.0: 10504; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10505; GFX900-NEXT: ;;#ASMSTART 10506; 
GFX900-NEXT: ; def s[4:5] 10507; GFX900-NEXT: ;;#ASMEND 10508; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 10509; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10510; GFX900-NEXT: ;;#ASMSTART 10511; GFX900-NEXT: ; use s[8:9] 10512; GFX900-NEXT: ;;#ASMEND 10513; GFX900-NEXT: s_setpc_b64 s[30:31] 10514; 10515; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2: 10516; GFX90A: ; %bb.0: 10517; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10518; GFX90A-NEXT: ;;#ASMSTART 10519; GFX90A-NEXT: ; def s[4:5] 10520; GFX90A-NEXT: ;;#ASMEND 10521; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 10522; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10523; GFX90A-NEXT: ;;#ASMSTART 10524; GFX90A-NEXT: ; use s[8:9] 10525; GFX90A-NEXT: ;;#ASMEND 10526; GFX90A-NEXT: s_setpc_b64 s[30:31] 10527; 10528; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2: 10529; GFX940: ; %bb.0: 10530; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10531; GFX940-NEXT: ;;#ASMSTART 10532; GFX940-NEXT: ; def s[0:1] 10533; GFX940-NEXT: ;;#ASMEND 10534; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 10535; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10536; GFX940-NEXT: ;;#ASMSTART 10537; GFX940-NEXT: ; use s[8:9] 10538; GFX940-NEXT: ;;#ASMEND 10539; GFX940-NEXT: s_setpc_b64 s[30:31] 10540 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10541 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10542 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2> 10543 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10544 ret void 10545} 10546 10547define void @s_shuffle_v4bf16_v3bf16__1_2_2_2() { 10548; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2: 10549; GFX900: ; %bb.0: 10550; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10551; GFX900-NEXT: ;;#ASMSTART 10552; GFX900-NEXT: ; def s[4:5] 10553; GFX900-NEXT: ;;#ASMEND 10554; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10555; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 
10556; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10557; GFX900-NEXT: ;;#ASMSTART 10558; GFX900-NEXT: ; use s[8:9] 10559; GFX900-NEXT: ;;#ASMEND 10560; GFX900-NEXT: s_setpc_b64 s[30:31] 10561; 10562; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2: 10563; GFX90A: ; %bb.0: 10564; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10565; GFX90A-NEXT: ;;#ASMSTART 10566; GFX90A-NEXT: ; def s[4:5] 10567; GFX90A-NEXT: ;;#ASMEND 10568; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10569; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 10570; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10571; GFX90A-NEXT: ;;#ASMSTART 10572; GFX90A-NEXT: ; use s[8:9] 10573; GFX90A-NEXT: ;;#ASMEND 10574; GFX90A-NEXT: s_setpc_b64 s[30:31] 10575; 10576; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2: 10577; GFX940: ; %bb.0: 10578; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10579; GFX940-NEXT: ;;#ASMSTART 10580; GFX940-NEXT: ; def s[0:1] 10581; GFX940-NEXT: ;;#ASMEND 10582; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10583; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 10584; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10585; GFX940-NEXT: ;;#ASMSTART 10586; GFX940-NEXT: ; use s[8:9] 10587; GFX940-NEXT: ;;#ASMEND 10588; GFX940-NEXT: s_setpc_b64 s[30:31] 10589 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10590 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10591 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2> 10592 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10593 ret void 10594} 10595 10596define void @s_shuffle_v4bf16_v3bf16__2_2_2_2() { 10597; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2: 10598; GFX900: ; %bb.0: 10599; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10600; GFX900-NEXT: ;;#ASMSTART 10601; GFX900-NEXT: ; def s[4:5] 10602; GFX900-NEXT: ;;#ASMEND 10603; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 10604; GFX900-NEXT: s_mov_b32 s9, s8 10605; GFX900-NEXT: ;;#ASMSTART 
10606; GFX900-NEXT: ; use s[8:9] 10607; GFX900-NEXT: ;;#ASMEND 10608; GFX900-NEXT: s_setpc_b64 s[30:31] 10609; 10610; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2: 10611; GFX90A: ; %bb.0: 10612; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10613; GFX90A-NEXT: ;;#ASMSTART 10614; GFX90A-NEXT: ; def s[4:5] 10615; GFX90A-NEXT: ;;#ASMEND 10616; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 10617; GFX90A-NEXT: s_mov_b32 s9, s8 10618; GFX90A-NEXT: ;;#ASMSTART 10619; GFX90A-NEXT: ; use s[8:9] 10620; GFX90A-NEXT: ;;#ASMEND 10621; GFX90A-NEXT: s_setpc_b64 s[30:31] 10622; 10623; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2: 10624; GFX940: ; %bb.0: 10625; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10626; GFX940-NEXT: ;;#ASMSTART 10627; GFX940-NEXT: ; def s[0:1] 10628; GFX940-NEXT: ;;#ASMEND 10629; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 10630; GFX940-NEXT: s_mov_b32 s9, s8 10631; GFX940-NEXT: ;;#ASMSTART 10632; GFX940-NEXT: ; use s[8:9] 10633; GFX940-NEXT: ;;#ASMEND 10634; GFX940-NEXT: s_setpc_b64 s[30:31] 10635 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10636 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10637 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 10638 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10639 ret void 10640} 10641 10642define void @s_shuffle_v4bf16_v3bf16__3_2_2_2() { 10643; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2: 10644; GFX900: ; %bb.0: 10645; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10646; GFX900-NEXT: ;;#ASMSTART 10647; GFX900-NEXT: ; def s[4:5] 10648; GFX900-NEXT: ;;#ASMEND 10649; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10650; GFX900-NEXT: s_lshl_b32 s8, s5, 16 10651; GFX900-NEXT: ;;#ASMSTART 10652; GFX900-NEXT: ; use s[8:9] 10653; GFX900-NEXT: ;;#ASMEND 10654; GFX900-NEXT: s_setpc_b64 s[30:31] 10655; 10656; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2: 10657; GFX90A: ; 
%bb.0: 10658; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10659; GFX90A-NEXT: ;;#ASMSTART 10660; GFX90A-NEXT: ; def s[4:5] 10661; GFX90A-NEXT: ;;#ASMEND 10662; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10663; GFX90A-NEXT: s_lshl_b32 s8, s5, 16 10664; GFX90A-NEXT: ;;#ASMSTART 10665; GFX90A-NEXT: ; use s[8:9] 10666; GFX90A-NEXT: ;;#ASMEND 10667; GFX90A-NEXT: s_setpc_b64 s[30:31] 10668; 10669; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2: 10670; GFX940: ; %bb.0: 10671; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10672; GFX940-NEXT: ;;#ASMSTART 10673; GFX940-NEXT: ; def s[0:1] 10674; GFX940-NEXT: ;;#ASMEND 10675; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10676; GFX940-NEXT: s_lshl_b32 s8, s1, 16 10677; GFX940-NEXT: ;;#ASMSTART 10678; GFX940-NEXT: ; use s[8:9] 10679; GFX940-NEXT: ;;#ASMEND 10680; GFX940-NEXT: s_setpc_b64 s[30:31] 10681 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10682 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10683 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2> 10684 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10685 ret void 10686} 10687 10688define void @s_shuffle_v4bf16_v3bf16__4_2_2_2() { 10689; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2: 10690; GFX900: ; %bb.0: 10691; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10692; GFX900-NEXT: ;;#ASMSTART 10693; GFX900-NEXT: ; def s[4:5] 10694; GFX900-NEXT: ;;#ASMEND 10695; GFX900-NEXT: ;;#ASMSTART 10696; GFX900-NEXT: ; def s[6:7] 10697; GFX900-NEXT: ;;#ASMEND 10698; GFX900-NEXT: s_lshr_b32 s4, s6, 16 10699; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 10700; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10701; GFX900-NEXT: ;;#ASMSTART 10702; GFX900-NEXT: ; use s[8:9] 10703; GFX900-NEXT: ;;#ASMEND 10704; GFX900-NEXT: s_setpc_b64 s[30:31] 10705; 10706; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2: 10707; GFX90A: ; %bb.0: 10708; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10709; GFX90A-NEXT: ;;#ASMSTART 10710; GFX90A-NEXT: ; def s[4:5] 10711; GFX90A-NEXT: ;;#ASMEND 10712; GFX90A-NEXT: ;;#ASMSTART 10713; GFX90A-NEXT: ; def s[6:7] 10714; GFX90A-NEXT: ;;#ASMEND 10715; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 10716; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 10717; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10718; GFX90A-NEXT: ;;#ASMSTART 10719; GFX90A-NEXT: ; use s[8:9] 10720; GFX90A-NEXT: ;;#ASMEND 10721; GFX90A-NEXT: s_setpc_b64 s[30:31] 10722; 10723; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2: 10724; GFX940: ; %bb.0: 10725; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10726; GFX940-NEXT: ;;#ASMSTART 10727; GFX940-NEXT: ; def s[0:1] 10728; GFX940-NEXT: ;;#ASMEND 10729; GFX940-NEXT: ;;#ASMSTART 10730; GFX940-NEXT: ; def s[2:3] 10731; GFX940-NEXT: ;;#ASMEND 10732; GFX940-NEXT: s_lshr_b32 s0, s2, 16 10733; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 10734; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10735; GFX940-NEXT: ;;#ASMSTART 10736; GFX940-NEXT: ; use s[8:9] 10737; GFX940-NEXT: ;;#ASMEND 10738; GFX940-NEXT: s_setpc_b64 s[30:31] 10739 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10740 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10741 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10742 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10743 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 2, i32 2, i32 2> 10744 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10745 ret void 10746} 10747 10748define void @s_shuffle_v4bf16_v3bf16__5_2_2_2() { 10749; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2: 10750; GFX900: ; %bb.0: 10751; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10752; GFX900-NEXT: ;;#ASMSTART 10753; GFX900-NEXT: ; def s[4:5] 10754; GFX900-NEXT: ;;#ASMEND 10755; GFX900-NEXT: ;;#ASMSTART 10756; GFX900-NEXT: ; 
def s[6:7] 10757; GFX900-NEXT: ;;#ASMEND 10758; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 10759; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10760; GFX900-NEXT: ;;#ASMSTART 10761; GFX900-NEXT: ; use s[8:9] 10762; GFX900-NEXT: ;;#ASMEND 10763; GFX900-NEXT: s_setpc_b64 s[30:31] 10764; 10765; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2: 10766; GFX90A: ; %bb.0: 10767; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10768; GFX90A-NEXT: ;;#ASMSTART 10769; GFX90A-NEXT: ; def s[4:5] 10770; GFX90A-NEXT: ;;#ASMEND 10771; GFX90A-NEXT: ;;#ASMSTART 10772; GFX90A-NEXT: ; def s[6:7] 10773; GFX90A-NEXT: ;;#ASMEND 10774; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 10775; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10776; GFX90A-NEXT: ;;#ASMSTART 10777; GFX90A-NEXT: ; use s[8:9] 10778; GFX90A-NEXT: ;;#ASMEND 10779; GFX90A-NEXT: s_setpc_b64 s[30:31] 10780; 10781; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2: 10782; GFX940: ; %bb.0: 10783; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10784; GFX940-NEXT: ;;#ASMSTART 10785; GFX940-NEXT: ; def s[0:1] 10786; GFX940-NEXT: ;;#ASMEND 10787; GFX940-NEXT: ;;#ASMSTART 10788; GFX940-NEXT: ; def s[2:3] 10789; GFX940-NEXT: ;;#ASMEND 10790; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 10791; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10792; GFX940-NEXT: ;;#ASMSTART 10793; GFX940-NEXT: ; use s[8:9] 10794; GFX940-NEXT: ;;#ASMEND 10795; GFX940-NEXT: s_setpc_b64 s[30:31] 10796 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10797 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10798 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10799 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10800 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 2, i32 2> 10801 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10802 ret void 10803} 10804 10805define void 
@s_shuffle_v4bf16_v3bf16__5_u_2_2() { 10806; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2: 10807; GFX900: ; %bb.0: 10808; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10809; GFX900-NEXT: ;;#ASMSTART 10810; GFX900-NEXT: ; def s[4:5] 10811; GFX900-NEXT: ;;#ASMEND 10812; GFX900-NEXT: ;;#ASMSTART 10813; GFX900-NEXT: ; def s[6:7] 10814; GFX900-NEXT: ;;#ASMEND 10815; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10816; GFX900-NEXT: s_mov_b32 s8, s7 10817; GFX900-NEXT: ;;#ASMSTART 10818; GFX900-NEXT: ; use s[8:9] 10819; GFX900-NEXT: ;;#ASMEND 10820; GFX900-NEXT: s_setpc_b64 s[30:31] 10821; 10822; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2: 10823; GFX90A: ; %bb.0: 10824; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10825; GFX90A-NEXT: ;;#ASMSTART 10826; GFX90A-NEXT: ; def s[4:5] 10827; GFX90A-NEXT: ;;#ASMEND 10828; GFX90A-NEXT: ;;#ASMSTART 10829; GFX90A-NEXT: ; def s[6:7] 10830; GFX90A-NEXT: ;;#ASMEND 10831; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10832; GFX90A-NEXT: s_mov_b32 s8, s7 10833; GFX90A-NEXT: ;;#ASMSTART 10834; GFX90A-NEXT: ; use s[8:9] 10835; GFX90A-NEXT: ;;#ASMEND 10836; GFX90A-NEXT: s_setpc_b64 s[30:31] 10837; 10838; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2: 10839; GFX940: ; %bb.0: 10840; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10841; GFX940-NEXT: ;;#ASMSTART 10842; GFX940-NEXT: ; def s[0:1] 10843; GFX940-NEXT: ;;#ASMEND 10844; GFX940-NEXT: ;;#ASMSTART 10845; GFX940-NEXT: ; def s[2:3] 10846; GFX940-NEXT: ;;#ASMEND 10847; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10848; GFX940-NEXT: s_mov_b32 s8, s3 10849; GFX940-NEXT: ;;#ASMSTART 10850; GFX940-NEXT: ; use s[8:9] 10851; GFX940-NEXT: ;;#ASMEND 10852; GFX940-NEXT: s_setpc_b64 s[30:31] 10853 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10854 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10855 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10856 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x 
i32> <i32 0, i32 1, i32 2> 10857 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 2, i32 2> 10858 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10859 ret void 10860} 10861 10862define void @s_shuffle_v4bf16_v3bf16__5_0_2_2() { 10863; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2: 10864; GFX900: ; %bb.0: 10865; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10866; GFX900-NEXT: ;;#ASMSTART 10867; GFX900-NEXT: ; def s[4:5] 10868; GFX900-NEXT: ;;#ASMEND 10869; GFX900-NEXT: ;;#ASMSTART 10870; GFX900-NEXT: ; def s[6:7] 10871; GFX900-NEXT: ;;#ASMEND 10872; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 10873; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10874; GFX900-NEXT: ;;#ASMSTART 10875; GFX900-NEXT: ; use s[8:9] 10876; GFX900-NEXT: ;;#ASMEND 10877; GFX900-NEXT: s_setpc_b64 s[30:31] 10878; 10879; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2: 10880; GFX90A: ; %bb.0: 10881; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10882; GFX90A-NEXT: ;;#ASMSTART 10883; GFX90A-NEXT: ; def s[4:5] 10884; GFX90A-NEXT: ;;#ASMEND 10885; GFX90A-NEXT: ;;#ASMSTART 10886; GFX90A-NEXT: ; def s[6:7] 10887; GFX90A-NEXT: ;;#ASMEND 10888; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 10889; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10890; GFX90A-NEXT: ;;#ASMSTART 10891; GFX90A-NEXT: ; use s[8:9] 10892; GFX90A-NEXT: ;;#ASMEND 10893; GFX90A-NEXT: s_setpc_b64 s[30:31] 10894; 10895; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2: 10896; GFX940: ; %bb.0: 10897; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10898; GFX940-NEXT: ;;#ASMSTART 10899; GFX940-NEXT: ; def s[0:1] 10900; GFX940-NEXT: ;;#ASMEND 10901; GFX940-NEXT: ;;#ASMSTART 10902; GFX940-NEXT: ; def s[2:3] 10903; GFX940-NEXT: ;;#ASMEND 10904; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 10905; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10906; GFX940-NEXT: ;;#ASMSTART 10907; GFX940-NEXT: ; use s[8:9] 10908; GFX940-NEXT: ;;#ASMEND 10909; GFX940-NEXT: s_setpc_b64 
s[30:31] 10910 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10911 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10912 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10913 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10914 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 2, i32 2> 10915 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10916 ret void 10917} 10918 10919define void @s_shuffle_v4bf16_v3bf16__5_1_2_2() { 10920; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2: 10921; GFX900: ; %bb.0: 10922; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10923; GFX900-NEXT: ;;#ASMSTART 10924; GFX900-NEXT: ; def s[4:5] 10925; GFX900-NEXT: ;;#ASMEND 10926; GFX900-NEXT: s_lshr_b32 s4, s4, 16 10927; GFX900-NEXT: ;;#ASMSTART 10928; GFX900-NEXT: ; def s[6:7] 10929; GFX900-NEXT: ;;#ASMEND 10930; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 10931; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10932; GFX900-NEXT: ;;#ASMSTART 10933; GFX900-NEXT: ; use s[8:9] 10934; GFX900-NEXT: ;;#ASMEND 10935; GFX900-NEXT: s_setpc_b64 s[30:31] 10936; 10937; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2: 10938; GFX90A: ; %bb.0: 10939; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10940; GFX90A-NEXT: ;;#ASMSTART 10941; GFX90A-NEXT: ; def s[4:5] 10942; GFX90A-NEXT: ;;#ASMEND 10943; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 10944; GFX90A-NEXT: ;;#ASMSTART 10945; GFX90A-NEXT: ; def s[6:7] 10946; GFX90A-NEXT: ;;#ASMEND 10947; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 10948; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10949; GFX90A-NEXT: ;;#ASMSTART 10950; GFX90A-NEXT: ; use s[8:9] 10951; GFX90A-NEXT: ;;#ASMEND 10952; GFX90A-NEXT: s_setpc_b64 s[30:31] 10953; 10954; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2: 10955; GFX940: ; %bb.0: 10956; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10957; GFX940-NEXT: ;;#ASMSTART 
10958; GFX940-NEXT: ; def s[0:1] 10959; GFX940-NEXT: ;;#ASMEND 10960; GFX940-NEXT: s_lshr_b32 s0, s0, 16 10961; GFX940-NEXT: ;;#ASMSTART 10962; GFX940-NEXT: ; def s[2:3] 10963; GFX940-NEXT: ;;#ASMEND 10964; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 10965; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 10966; GFX940-NEXT: ;;#ASMSTART 10967; GFX940-NEXT: ; use s[8:9] 10968; GFX940-NEXT: ;;#ASMEND 10969; GFX940-NEXT: s_setpc_b64 s[30:31] 10970 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 10971 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 10972 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10973 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 10974 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 2, i32 2> 10975 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 10976 ret void 10977} 10978 10979define void @s_shuffle_v4bf16_v3bf16__5_3_2_2() { 10980; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2: 10981; GFX900: ; %bb.0: 10982; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10983; GFX900-NEXT: ;;#ASMSTART 10984; GFX900-NEXT: ; def s[4:5] 10985; GFX900-NEXT: ;;#ASMEND 10986; GFX900-NEXT: ;;#ASMSTART 10987; GFX900-NEXT: ; def s[6:7] 10988; GFX900-NEXT: ;;#ASMEND 10989; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s6 10990; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 10991; GFX900-NEXT: ;;#ASMSTART 10992; GFX900-NEXT: ; use s[8:9] 10993; GFX900-NEXT: ;;#ASMEND 10994; GFX900-NEXT: s_setpc_b64 s[30:31] 10995; 10996; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2: 10997; GFX90A: ; %bb.0: 10998; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10999; GFX90A-NEXT: ;;#ASMSTART 11000; GFX90A-NEXT: ; def s[4:5] 11001; GFX90A-NEXT: ;;#ASMEND 11002; GFX90A-NEXT: ;;#ASMSTART 11003; GFX90A-NEXT: ; def s[6:7] 11004; GFX90A-NEXT: ;;#ASMEND 11005; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s6 11006; GFX90A-NEXT: 
s_pack_ll_b32_b16 s9, s5, s5 11007; GFX90A-NEXT: ;;#ASMSTART 11008; GFX90A-NEXT: ; use s[8:9] 11009; GFX90A-NEXT: ;;#ASMEND 11010; GFX90A-NEXT: s_setpc_b64 s[30:31] 11011; 11012; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2: 11013; GFX940: ; %bb.0: 11014; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11015; GFX940-NEXT: ;;#ASMSTART 11016; GFX940-NEXT: ; def s[0:1] 11017; GFX940-NEXT: ;;#ASMEND 11018; GFX940-NEXT: ;;#ASMSTART 11019; GFX940-NEXT: ; def s[2:3] 11020; GFX940-NEXT: ;;#ASMEND 11021; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s2 11022; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 11023; GFX940-NEXT: ;;#ASMSTART 11024; GFX940-NEXT: ; use s[8:9] 11025; GFX940-NEXT: ;;#ASMEND 11026; GFX940-NEXT: s_setpc_b64 s[30:31] 11027 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11028 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11029 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11030 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11031 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 2, i32 2> 11032 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11033 ret void 11034} 11035 11036define void @s_shuffle_v4bf16_v3bf16__5_4_2_2() { 11037; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2: 11038; GFX900: ; %bb.0: 11039; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11040; GFX900-NEXT: ;;#ASMSTART 11041; GFX900-NEXT: ; def s[4:5] 11042; GFX900-NEXT: ;;#ASMEND 11043; GFX900-NEXT: ;;#ASMSTART 11044; GFX900-NEXT: ; def s[6:7] 11045; GFX900-NEXT: ;;#ASMEND 11046; GFX900-NEXT: s_lshr_b32 s4, s6, 16 11047; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 11048; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 11049; GFX900-NEXT: ;;#ASMSTART 11050; GFX900-NEXT: ; use s[8:9] 11051; GFX900-NEXT: ;;#ASMEND 11052; GFX900-NEXT: s_setpc_b64 s[30:31] 11053; 11054; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2: 
11055; GFX90A: ; %bb.0: 11056; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11057; GFX90A-NEXT: ;;#ASMSTART 11058; GFX90A-NEXT: ; def s[4:5] 11059; GFX90A-NEXT: ;;#ASMEND 11060; GFX90A-NEXT: ;;#ASMSTART 11061; GFX90A-NEXT: ; def s[6:7] 11062; GFX90A-NEXT: ;;#ASMEND 11063; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 11064; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 11065; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 11066; GFX90A-NEXT: ;;#ASMSTART 11067; GFX90A-NEXT: ; use s[8:9] 11068; GFX90A-NEXT: ;;#ASMEND 11069; GFX90A-NEXT: s_setpc_b64 s[30:31] 11070; 11071; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2: 11072; GFX940: ; %bb.0: 11073; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11074; GFX940-NEXT: ;;#ASMSTART 11075; GFX940-NEXT: ; def s[0:1] 11076; GFX940-NEXT: ;;#ASMEND 11077; GFX940-NEXT: ;;#ASMSTART 11078; GFX940-NEXT: ; def s[2:3] 11079; GFX940-NEXT: ;;#ASMEND 11080; GFX940-NEXT: s_lshr_b32 s0, s2, 16 11081; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 11082; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 11083; GFX940-NEXT: ;;#ASMSTART 11084; GFX940-NEXT: ; use s[8:9] 11085; GFX940-NEXT: ;;#ASMEND 11086; GFX940-NEXT: s_setpc_b64 s[30:31] 11087 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11088 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11089 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11090 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11091 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 2, i32 2> 11092 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11093 ret void 11094} 11095 11096define void @s_shuffle_v4bf16_v3bf16__5_5_2_2() { 11097; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2: 11098; GFX900: ; %bb.0: 11099; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11100; GFX900-NEXT: ;;#ASMSTART 11101; GFX900-NEXT: ; def s[4:5] 11102; GFX900-NEXT: ;;#ASMEND 11103; 
GFX900-NEXT: ;;#ASMSTART 11104; GFX900-NEXT: ; def s[6:7] 11105; GFX900-NEXT: ;;#ASMEND 11106; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 11107; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11108; GFX900-NEXT: ;;#ASMSTART 11109; GFX900-NEXT: ; use s[8:9] 11110; GFX900-NEXT: ;;#ASMEND 11111; GFX900-NEXT: s_setpc_b64 s[30:31] 11112; 11113; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2: 11114; GFX90A: ; %bb.0: 11115; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11116; GFX90A-NEXT: ;;#ASMSTART 11117; GFX90A-NEXT: ; def s[4:5] 11118; GFX90A-NEXT: ;;#ASMEND 11119; GFX90A-NEXT: ;;#ASMSTART 11120; GFX90A-NEXT: ; def s[6:7] 11121; GFX90A-NEXT: ;;#ASMEND 11122; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 11123; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11124; GFX90A-NEXT: ;;#ASMSTART 11125; GFX90A-NEXT: ; use s[8:9] 11126; GFX90A-NEXT: ;;#ASMEND 11127; GFX90A-NEXT: s_setpc_b64 s[30:31] 11128; 11129; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2: 11130; GFX940: ; %bb.0: 11131; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11132; GFX940-NEXT: ;;#ASMSTART 11133; GFX940-NEXT: ; def s[0:1] 11134; GFX940-NEXT: ;;#ASMEND 11135; GFX940-NEXT: ;;#ASMSTART 11136; GFX940-NEXT: ; def s[2:3] 11137; GFX940-NEXT: ;;#ASMEND 11138; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 11139; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 11140; GFX940-NEXT: ;;#ASMSTART 11141; GFX940-NEXT: ; use s[8:9] 11142; GFX940-NEXT: ;;#ASMEND 11143; GFX940-NEXT: s_setpc_b64 s[30:31] 11144 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11145 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11146 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11147 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11148 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 2> 11149 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11150 ret void 11151} 
11152 11153define void @s_shuffle_v4bf16_v3bf16__5_5_u_2() { 11154; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2: 11155; GFX900: ; %bb.0: 11156; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11157; GFX900-NEXT: ;;#ASMSTART 11158; GFX900-NEXT: ; def s[4:5] 11159; GFX900-NEXT: ;;#ASMEND 11160; GFX900-NEXT: ;;#ASMSTART 11161; GFX900-NEXT: ; def s[6:7] 11162; GFX900-NEXT: ;;#ASMEND 11163; GFX900-NEXT: s_lshl_b32 s9, s5, 16 11164; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11165; GFX900-NEXT: ;;#ASMSTART 11166; GFX900-NEXT: ; use s[8:9] 11167; GFX900-NEXT: ;;#ASMEND 11168; GFX900-NEXT: s_setpc_b64 s[30:31] 11169; 11170; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2: 11171; GFX90A: ; %bb.0: 11172; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11173; GFX90A-NEXT: ;;#ASMSTART 11174; GFX90A-NEXT: ; def s[4:5] 11175; GFX90A-NEXT: ;;#ASMEND 11176; GFX90A-NEXT: ;;#ASMSTART 11177; GFX90A-NEXT: ; def s[6:7] 11178; GFX90A-NEXT: ;;#ASMEND 11179; GFX90A-NEXT: s_lshl_b32 s9, s5, 16 11180; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11181; GFX90A-NEXT: ;;#ASMSTART 11182; GFX90A-NEXT: ; use s[8:9] 11183; GFX90A-NEXT: ;;#ASMEND 11184; GFX90A-NEXT: s_setpc_b64 s[30:31] 11185; 11186; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2: 11187; GFX940: ; %bb.0: 11188; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11189; GFX940-NEXT: ;;#ASMSTART 11190; GFX940-NEXT: ; def s[0:1] 11191; GFX940-NEXT: ;;#ASMEND 11192; GFX940-NEXT: ;;#ASMSTART 11193; GFX940-NEXT: ; def s[2:3] 11194; GFX940-NEXT: ;;#ASMEND 11195; GFX940-NEXT: s_lshl_b32 s9, s1, 16 11196; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 11197; GFX940-NEXT: ;;#ASMSTART 11198; GFX940-NEXT: ; use s[8:9] 11199; GFX940-NEXT: ;;#ASMEND 11200; GFX940-NEXT: s_setpc_b64 s[30:31] 11201 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11202 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11203 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11204 %extract31 = shufflevector <4 x 
bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11205 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 2> 11206 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11207 ret void 11208} 11209 11210define void @s_shuffle_v4bf16_v3bf16__5_5_0_2() { 11211; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2: 11212; GFX900: ; %bb.0: 11213; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11214; GFX900-NEXT: ;;#ASMSTART 11215; GFX900-NEXT: ; def s[4:5] 11216; GFX900-NEXT: ;;#ASMEND 11217; GFX900-NEXT: ;;#ASMSTART 11218; GFX900-NEXT: ; def s[6:7] 11219; GFX900-NEXT: ;;#ASMEND 11220; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5 11221; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11222; GFX900-NEXT: ;;#ASMSTART 11223; GFX900-NEXT: ; use s[8:9] 11224; GFX900-NEXT: ;;#ASMEND 11225; GFX900-NEXT: s_setpc_b64 s[30:31] 11226; 11227; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2: 11228; GFX90A: ; %bb.0: 11229; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11230; GFX90A-NEXT: ;;#ASMSTART 11231; GFX90A-NEXT: ; def s[4:5] 11232; GFX90A-NEXT: ;;#ASMEND 11233; GFX90A-NEXT: ;;#ASMSTART 11234; GFX90A-NEXT: ; def s[6:7] 11235; GFX90A-NEXT: ;;#ASMEND 11236; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5 11237; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11238; GFX90A-NEXT: ;;#ASMSTART 11239; GFX90A-NEXT: ; use s[8:9] 11240; GFX90A-NEXT: ;;#ASMEND 11241; GFX90A-NEXT: s_setpc_b64 s[30:31] 11242; 11243; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2: 11244; GFX940: ; %bb.0: 11245; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11246; GFX940-NEXT: ;;#ASMSTART 11247; GFX940-NEXT: ; def s[0:1] 11248; GFX940-NEXT: ;;#ASMEND 11249; GFX940-NEXT: ;;#ASMSTART 11250; GFX940-NEXT: ; def s[2:3] 11251; GFX940-NEXT: ;;#ASMEND 11252; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1 11253; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 11254; GFX940-NEXT: ;;#ASMSTART 11255; GFX940-NEXT: ; use s[8:9] 11256; GFX940-NEXT: 
;;#ASMEND 11257; GFX940-NEXT: s_setpc_b64 s[30:31] 11258 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11259 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11260 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11261 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11262 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 2> 11263 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11264 ret void 11265} 11266 11267define void @s_shuffle_v4bf16_v3bf16__5_5_1_2() { 11268; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2: 11269; GFX900: ; %bb.0: 11270; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11271; GFX900-NEXT: ;;#ASMSTART 11272; GFX900-NEXT: ; def s[4:5] 11273; GFX900-NEXT: ;;#ASMEND 11274; GFX900-NEXT: s_lshr_b32 s4, s4, 16 11275; GFX900-NEXT: ;;#ASMSTART 11276; GFX900-NEXT: ; def s[6:7] 11277; GFX900-NEXT: ;;#ASMEND 11278; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5 11279; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11280; GFX900-NEXT: ;;#ASMSTART 11281; GFX900-NEXT: ; use s[8:9] 11282; GFX900-NEXT: ;;#ASMEND 11283; GFX900-NEXT: s_setpc_b64 s[30:31] 11284; 11285; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2: 11286; GFX90A: ; %bb.0: 11287; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11288; GFX90A-NEXT: ;;#ASMSTART 11289; GFX90A-NEXT: ; def s[4:5] 11290; GFX90A-NEXT: ;;#ASMEND 11291; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 11292; GFX90A-NEXT: ;;#ASMSTART 11293; GFX90A-NEXT: ; def s[6:7] 11294; GFX90A-NEXT: ;;#ASMEND 11295; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5 11296; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11297; GFX90A-NEXT: ;;#ASMSTART 11298; GFX90A-NEXT: ; use s[8:9] 11299; GFX90A-NEXT: ;;#ASMEND 11300; GFX90A-NEXT: s_setpc_b64 s[30:31] 11301; 11302; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2: 11303; GFX940: ; %bb.0: 11304; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 11305; GFX940-NEXT: ;;#ASMSTART 11306; GFX940-NEXT: ; def s[0:1] 11307; GFX940-NEXT: ;;#ASMEND 11308; GFX940-NEXT: s_lshr_b32 s0, s0, 16 11309; GFX940-NEXT: ;;#ASMSTART 11310; GFX940-NEXT: ; def s[2:3] 11311; GFX940-NEXT: ;;#ASMEND 11312; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1 11313; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 11314; GFX940-NEXT: ;;#ASMSTART 11315; GFX940-NEXT: ; use s[8:9] 11316; GFX940-NEXT: ;;#ASMEND 11317; GFX940-NEXT: s_setpc_b64 s[30:31] 11318 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11319 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11320 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11321 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11322 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 2> 11323 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11324 ret void 11325} 11326 11327define void @s_shuffle_v4bf16_v3bf16__5_5_3_2() { 11328; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2: 11329; GFX900: ; %bb.0: 11330; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11331; GFX900-NEXT: ;;#ASMSTART 11332; GFX900-NEXT: ; def s[4:5] 11333; GFX900-NEXT: ;;#ASMEND 11334; GFX900-NEXT: ;;#ASMSTART 11335; GFX900-NEXT: ; def s[6:7] 11336; GFX900-NEXT: ;;#ASMEND 11337; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s5 11338; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11339; GFX900-NEXT: ;;#ASMSTART 11340; GFX900-NEXT: ; use s[8:9] 11341; GFX900-NEXT: ;;#ASMEND 11342; GFX900-NEXT: s_setpc_b64 s[30:31] 11343; 11344; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2: 11345; GFX90A: ; %bb.0: 11346; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11347; GFX90A-NEXT: ;;#ASMSTART 11348; GFX90A-NEXT: ; def s[4:5] 11349; GFX90A-NEXT: ;;#ASMEND 11350; GFX90A-NEXT: ;;#ASMSTART 11351; GFX90A-NEXT: ; def s[6:7] 11352; GFX90A-NEXT: ;;#ASMEND 11353; GFX90A-NEXT: 
s_pack_ll_b32_b16 s9, s6, s5 11354; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11355; GFX90A-NEXT: ;;#ASMSTART 11356; GFX90A-NEXT: ; use s[8:9] 11357; GFX90A-NEXT: ;;#ASMEND 11358; GFX90A-NEXT: s_setpc_b64 s[30:31] 11359; 11360; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2: 11361; GFX940: ; %bb.0: 11362; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11363; GFX940-NEXT: ;;#ASMSTART 11364; GFX940-NEXT: ; def s[0:1] 11365; GFX940-NEXT: ;;#ASMEND 11366; GFX940-NEXT: ;;#ASMSTART 11367; GFX940-NEXT: ; def s[2:3] 11368; GFX940-NEXT: ;;#ASMEND 11369; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s1 11370; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 11371; GFX940-NEXT: ;;#ASMSTART 11372; GFX940-NEXT: ; use s[8:9] 11373; GFX940-NEXT: ;;#ASMEND 11374; GFX940-NEXT: s_setpc_b64 s[30:31] 11375 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11376 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11377 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11378 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11379 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 2> 11380 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11381 ret void 11382} 11383 11384define void @s_shuffle_v4bf16_v3bf16__5_5_4_2() { 11385; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2: 11386; GFX900: ; %bb.0: 11387; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11388; GFX900-NEXT: ;;#ASMSTART 11389; GFX900-NEXT: ; def s[4:5] 11390; GFX900-NEXT: ;;#ASMEND 11391; GFX900-NEXT: ;;#ASMSTART 11392; GFX900-NEXT: ; def s[6:7] 11393; GFX900-NEXT: ;;#ASMEND 11394; GFX900-NEXT: s_lshr_b32 s4, s6, 16 11395; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5 11396; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11397; GFX900-NEXT: ;;#ASMSTART 11398; GFX900-NEXT: ; use s[8:9] 11399; GFX900-NEXT: ;;#ASMEND 11400; GFX900-NEXT: s_setpc_b64 s[30:31] 11401; 11402; 
GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2: 11403; GFX90A: ; %bb.0: 11404; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11405; GFX90A-NEXT: ;;#ASMSTART 11406; GFX90A-NEXT: ; def s[4:5] 11407; GFX90A-NEXT: ;;#ASMEND 11408; GFX90A-NEXT: ;;#ASMSTART 11409; GFX90A-NEXT: ; def s[6:7] 11410; GFX90A-NEXT: ;;#ASMEND 11411; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 11412; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5 11413; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 11414; GFX90A-NEXT: ;;#ASMSTART 11415; GFX90A-NEXT: ; use s[8:9] 11416; GFX90A-NEXT: ;;#ASMEND 11417; GFX90A-NEXT: s_setpc_b64 s[30:31] 11418; 11419; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2: 11420; GFX940: ; %bb.0: 11421; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11422; GFX940-NEXT: ;;#ASMSTART 11423; GFX940-NEXT: ; def s[0:1] 11424; GFX940-NEXT: ;;#ASMEND 11425; GFX940-NEXT: ;;#ASMSTART 11426; GFX940-NEXT: ; def s[2:3] 11427; GFX940-NEXT: ;;#ASMEND 11428; GFX940-NEXT: s_lshr_b32 s0, s2, 16 11429; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1 11430; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 11431; GFX940-NEXT: ;;#ASMSTART 11432; GFX940-NEXT: ; use s[8:9] 11433; GFX940-NEXT: ;;#ASMEND 11434; GFX940-NEXT: s_setpc_b64 s[30:31] 11435 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11436 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11437 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11438 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11439 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 2> 11440 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11441 ret void 11442} 11443 11444define void @s_shuffle_v4bf16_v3bf16__u_3_3_3() { 11445; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__u_3_3_3: 11446; GFX9: ; %bb.0: 11447; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11448; GFX9-NEXT: ;;#ASMSTART 11449; GFX9-NEXT: ; use s[8:9] 
11450; GFX9-NEXT: ;;#ASMEND 11451; GFX9-NEXT: s_setpc_b64 s[30:31] 11452 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11453 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11454 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3> 11455 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11456 ret void 11457} 11458 11459define void @s_shuffle_v4bf16_v3bf16__0_3_3_3() { 11460; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3: 11461; GFX900: ; %bb.0: 11462; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11463; GFX900-NEXT: ;;#ASMSTART 11464; GFX900-NEXT: ; def s[8:9] 11465; GFX900-NEXT: ;;#ASMEND 11466; GFX900-NEXT: ;;#ASMSTART 11467; GFX900-NEXT: ; use s[8:9] 11468; GFX900-NEXT: ;;#ASMEND 11469; GFX900-NEXT: s_setpc_b64 s[30:31] 11470; 11471; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3: 11472; GFX90A: ; %bb.0: 11473; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11474; GFX90A-NEXT: ;;#ASMSTART 11475; GFX90A-NEXT: ; def s[8:9] 11476; GFX90A-NEXT: ;;#ASMEND 11477; GFX90A-NEXT: ;;#ASMSTART 11478; GFX90A-NEXT: ; use s[8:9] 11479; GFX90A-NEXT: ;;#ASMEND 11480; GFX90A-NEXT: s_setpc_b64 s[30:31] 11481; 11482; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3: 11483; GFX940: ; %bb.0: 11484; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11485; GFX940-NEXT: ;;#ASMSTART 11486; GFX940-NEXT: ; def s[8:9] 11487; GFX940-NEXT: ;;#ASMEND 11488; GFX940-NEXT: s_nop 0 11489; GFX940-NEXT: ;;#ASMSTART 11490; GFX940-NEXT: ; use s[8:9] 11491; GFX940-NEXT: ;;#ASMEND 11492; GFX940-NEXT: s_setpc_b64 s[30:31] 11493 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11494 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11495 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3> 11496 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 
11497 ret void 11498} 11499 11500define void @s_shuffle_v4bf16_v3bf16__1_3_3_3() { 11501; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3: 11502; GFX900: ; %bb.0: 11503; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11504; GFX900-NEXT: ;;#ASMSTART 11505; GFX900-NEXT: ; def s[4:5] 11506; GFX900-NEXT: ;;#ASMEND 11507; GFX900-NEXT: s_lshr_b32 s8, s4, 16 11508; GFX900-NEXT: ;;#ASMSTART 11509; GFX900-NEXT: ; use s[8:9] 11510; GFX900-NEXT: ;;#ASMEND 11511; GFX900-NEXT: s_setpc_b64 s[30:31] 11512; 11513; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3: 11514; GFX90A: ; %bb.0: 11515; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11516; GFX90A-NEXT: ;;#ASMSTART 11517; GFX90A-NEXT: ; def s[4:5] 11518; GFX90A-NEXT: ;;#ASMEND 11519; GFX90A-NEXT: s_lshr_b32 s8, s4, 16 11520; GFX90A-NEXT: ;;#ASMSTART 11521; GFX90A-NEXT: ; use s[8:9] 11522; GFX90A-NEXT: ;;#ASMEND 11523; GFX90A-NEXT: s_setpc_b64 s[30:31] 11524; 11525; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3: 11526; GFX940: ; %bb.0: 11527; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11528; GFX940-NEXT: ;;#ASMSTART 11529; GFX940-NEXT: ; def s[0:1] 11530; GFX940-NEXT: ;;#ASMEND 11531; GFX940-NEXT: s_lshr_b32 s8, s0, 16 11532; GFX940-NEXT: ;;#ASMSTART 11533; GFX940-NEXT: ; use s[8:9] 11534; GFX940-NEXT: ;;#ASMEND 11535; GFX940-NEXT: s_setpc_b64 s[30:31] 11536 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11537 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11538 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3> 11539 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11540 ret void 11541} 11542 11543define void @s_shuffle_v4bf16_v3bf16__2_3_3_3() { 11544; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: 11545; GFX900: ; %bb.0: 11546; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11547; GFX900-NEXT: ;;#ASMSTART 11548; GFX900-NEXT: ; def s[4:5] 11549; GFX900-NEXT: ;;#ASMEND 
11550; GFX900-NEXT: s_mov_b32 s8, s5 11551; GFX900-NEXT: ;;#ASMSTART 11552; GFX900-NEXT: ; use s[8:9] 11553; GFX900-NEXT: ;;#ASMEND 11554; GFX900-NEXT: s_setpc_b64 s[30:31] 11555; 11556; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: 11557; GFX90A: ; %bb.0: 11558; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11559; GFX90A-NEXT: ;;#ASMSTART 11560; GFX90A-NEXT: ; def s[4:5] 11561; GFX90A-NEXT: ;;#ASMEND 11562; GFX90A-NEXT: s_mov_b32 s8, s5 11563; GFX90A-NEXT: ;;#ASMSTART 11564; GFX90A-NEXT: ; use s[8:9] 11565; GFX90A-NEXT: ;;#ASMEND 11566; GFX90A-NEXT: s_setpc_b64 s[30:31] 11567; 11568; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: 11569; GFX940: ; %bb.0: 11570; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11571; GFX940-NEXT: ;;#ASMSTART 11572; GFX940-NEXT: ; def s[0:1] 11573; GFX940-NEXT: ;;#ASMEND 11574; GFX940-NEXT: s_mov_b32 s8, s1 11575; GFX940-NEXT: ;;#ASMSTART 11576; GFX940-NEXT: ; use s[8:9] 11577; GFX940-NEXT: ;;#ASMEND 11578; GFX940-NEXT: s_setpc_b64 s[30:31] 11579 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11580 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11581 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 11582 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11583 ret void 11584} 11585 11586define void @s_shuffle_v4bf16_v3bf16__3_3_3_3() { 11587; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__3_3_3_3: 11588; GFX9: ; %bb.0: 11589; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11590; GFX9-NEXT: ;;#ASMSTART 11591; GFX9-NEXT: ; use s[8:9] 11592; GFX9-NEXT: ;;#ASMEND 11593; GFX9-NEXT: s_setpc_b64 s[30:31] 11594 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11595 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11596 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 11597 call void asm sideeffect "; 
use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11598 ret void 11599} 11600 11601define void @s_shuffle_v4bf16_v3bf16__4_3_3_3() { 11602; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3: 11603; GFX900: ; %bb.0: 11604; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11605; GFX900-NEXT: ;;#ASMSTART 11606; GFX900-NEXT: ; def s[4:5] 11607; GFX900-NEXT: ;;#ASMEND 11608; GFX900-NEXT: s_lshr_b32 s5, s4, 16 11609; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 11610; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11611; GFX900-NEXT: ;;#ASMSTART 11612; GFX900-NEXT: ; use s[8:9] 11613; GFX900-NEXT: ;;#ASMEND 11614; GFX900-NEXT: s_setpc_b64 s[30:31] 11615; 11616; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3: 11617; GFX90A: ; %bb.0: 11618; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11619; GFX90A-NEXT: ;;#ASMSTART 11620; GFX90A-NEXT: ; def s[4:5] 11621; GFX90A-NEXT: ;;#ASMEND 11622; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 11623; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 11624; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11625; GFX90A-NEXT: ;;#ASMSTART 11626; GFX90A-NEXT: ; use s[8:9] 11627; GFX90A-NEXT: ;;#ASMEND 11628; GFX90A-NEXT: s_setpc_b64 s[30:31] 11629; 11630; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3: 11631; GFX940: ; %bb.0: 11632; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11633; GFX940-NEXT: ;;#ASMSTART 11634; GFX940-NEXT: ; def s[0:1] 11635; GFX940-NEXT: ;;#ASMEND 11636; GFX940-NEXT: s_lshr_b32 s1, s0, 16 11637; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 11638; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 11639; GFX940-NEXT: ;;#ASMSTART 11640; GFX940-NEXT: ; use s[8:9] 11641; GFX940-NEXT: ;;#ASMEND 11642; GFX940-NEXT: s_setpc_b64 s[30:31] 11643 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11644 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11645 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11646 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11647 %shuf 
= shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 3, i32 3, i32 3> 11648 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11649 ret void 11650} 11651 11652define void @s_shuffle_v4bf16_v3bf16__5_3_3_3() { 11653; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3: 11654; GFX900: ; %bb.0: 11655; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11656; GFX900-NEXT: ;;#ASMSTART 11657; GFX900-NEXT: ; def s[4:5] 11658; GFX900-NEXT: ;;#ASMEND 11659; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 11660; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11661; GFX900-NEXT: ;;#ASMSTART 11662; GFX900-NEXT: ; use s[8:9] 11663; GFX900-NEXT: ;;#ASMEND 11664; GFX900-NEXT: s_setpc_b64 s[30:31] 11665; 11666; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3: 11667; GFX90A: ; %bb.0: 11668; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11669; GFX90A-NEXT: ;;#ASMSTART 11670; GFX90A-NEXT: ; def s[4:5] 11671; GFX90A-NEXT: ;;#ASMEND 11672; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 11673; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11674; GFX90A-NEXT: ;;#ASMSTART 11675; GFX90A-NEXT: ; use s[8:9] 11676; GFX90A-NEXT: ;;#ASMEND 11677; GFX90A-NEXT: s_setpc_b64 s[30:31] 11678; 11679; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3: 11680; GFX940: ; %bb.0: 11681; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11682; GFX940-NEXT: ;;#ASMSTART 11683; GFX940-NEXT: ; def s[0:1] 11684; GFX940-NEXT: ;;#ASMEND 11685; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 11686; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 11687; GFX940-NEXT: ;;#ASMSTART 11688; GFX940-NEXT: ; use s[8:9] 11689; GFX940-NEXT: ;;#ASMEND 11690; GFX940-NEXT: s_setpc_b64 s[30:31] 11691 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11692 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11693 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11694 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 
11695 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 3, i32 3> 11696 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11697 ret void 11698} 11699 11700define void @s_shuffle_v4bf16_v3bf16__5_u_3_3() { 11701; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3: 11702; GFX900: ; %bb.0: 11703; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11704; GFX900-NEXT: ;;#ASMSTART 11705; GFX900-NEXT: ; def s[4:5] 11706; GFX900-NEXT: ;;#ASMEND 11707; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11708; GFX900-NEXT: s_mov_b32 s8, s5 11709; GFX900-NEXT: ;;#ASMSTART 11710; GFX900-NEXT: ; use s[8:9] 11711; GFX900-NEXT: ;;#ASMEND 11712; GFX900-NEXT: s_setpc_b64 s[30:31] 11713; 11714; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3: 11715; GFX90A: ; %bb.0: 11716; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11717; GFX90A-NEXT: ;;#ASMSTART 11718; GFX90A-NEXT: ; def s[4:5] 11719; GFX90A-NEXT: ;;#ASMEND 11720; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11721; GFX90A-NEXT: s_mov_b32 s8, s5 11722; GFX90A-NEXT: ;;#ASMSTART 11723; GFX90A-NEXT: ; use s[8:9] 11724; GFX90A-NEXT: ;;#ASMEND 11725; GFX90A-NEXT: s_setpc_b64 s[30:31] 11726; 11727; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3: 11728; GFX940: ; %bb.0: 11729; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11730; GFX940-NEXT: ;;#ASMSTART 11731; GFX940-NEXT: ; def s[0:1] 11732; GFX940-NEXT: ;;#ASMEND 11733; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 11734; GFX940-NEXT: s_mov_b32 s8, s1 11735; GFX940-NEXT: ;;#ASMSTART 11736; GFX940-NEXT: ; use s[8:9] 11737; GFX940-NEXT: ;;#ASMEND 11738; GFX940-NEXT: s_setpc_b64 s[30:31] 11739 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11740 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11741 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11742 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11743 %shuf = 
shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 3, i32 3> 11744 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11745 ret void 11746} 11747 11748define void @s_shuffle_v4bf16_v3bf16__5_0_3_3() { 11749; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3: 11750; GFX900: ; %bb.0: 11751; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11752; GFX900-NEXT: ;;#ASMSTART 11753; GFX900-NEXT: ; def s[4:5] 11754; GFX900-NEXT: ;;#ASMEND 11755; GFX900-NEXT: ;;#ASMSTART 11756; GFX900-NEXT: ; def s[6:7] 11757; GFX900-NEXT: ;;#ASMEND 11758; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 11759; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s6 11760; GFX900-NEXT: ;;#ASMSTART 11761; GFX900-NEXT: ; use s[8:9] 11762; GFX900-NEXT: ;;#ASMEND 11763; GFX900-NEXT: s_setpc_b64 s[30:31] 11764; 11765; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3: 11766; GFX90A: ; %bb.0: 11767; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11768; GFX90A-NEXT: ;;#ASMSTART 11769; GFX90A-NEXT: ; def s[4:5] 11770; GFX90A-NEXT: ;;#ASMEND 11771; GFX90A-NEXT: ;;#ASMSTART 11772; GFX90A-NEXT: ; def s[6:7] 11773; GFX90A-NEXT: ;;#ASMEND 11774; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 11775; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s6, s6 11776; GFX90A-NEXT: ;;#ASMSTART 11777; GFX90A-NEXT: ; use s[8:9] 11778; GFX90A-NEXT: ;;#ASMEND 11779; GFX90A-NEXT: s_setpc_b64 s[30:31] 11780; 11781; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3: 11782; GFX940: ; %bb.0: 11783; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11784; GFX940-NEXT: ;;#ASMSTART 11785; GFX940-NEXT: ; def s[0:1] 11786; GFX940-NEXT: ;;#ASMEND 11787; GFX940-NEXT: ;;#ASMSTART 11788; GFX940-NEXT: ; def s[2:3] 11789; GFX940-NEXT: ;;#ASMEND 11790; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 11791; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s2 11792; GFX940-NEXT: ;;#ASMSTART 11793; GFX940-NEXT: ; use s[8:9] 11794; GFX940-NEXT: ;;#ASMEND 11795; GFX940-NEXT: s_setpc_b64 s[30:31] 11796 %vec0 = call <4 x bfloat> 
asm "; def $0", "=s"() 11797 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11798 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11799 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11800 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 3, i32 3> 11801 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11802 ret void 11803} 11804 11805define void @s_shuffle_v4bf16_v3bf16__5_1_3_3() { 11806; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3: 11807; GFX900: ; %bb.0: 11808; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11809; GFX900-NEXT: ;;#ASMSTART 11810; GFX900-NEXT: ; def s[4:5] 11811; GFX900-NEXT: ;;#ASMEND 11812; GFX900-NEXT: s_lshr_b32 s4, s4, 16 11813; GFX900-NEXT: ;;#ASMSTART 11814; GFX900-NEXT: ; def s[6:7] 11815; GFX900-NEXT: ;;#ASMEND 11816; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 11817; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s6 11818; GFX900-NEXT: ;;#ASMSTART 11819; GFX900-NEXT: ; use s[8:9] 11820; GFX900-NEXT: ;;#ASMEND 11821; GFX900-NEXT: s_setpc_b64 s[30:31] 11822; 11823; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3: 11824; GFX90A: ; %bb.0: 11825; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11826; GFX90A-NEXT: ;;#ASMSTART 11827; GFX90A-NEXT: ; def s[4:5] 11828; GFX90A-NEXT: ;;#ASMEND 11829; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 11830; GFX90A-NEXT: ;;#ASMSTART 11831; GFX90A-NEXT: ; def s[6:7] 11832; GFX90A-NEXT: ;;#ASMEND 11833; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 11834; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s6, s6 11835; GFX90A-NEXT: ;;#ASMSTART 11836; GFX90A-NEXT: ; use s[8:9] 11837; GFX90A-NEXT: ;;#ASMEND 11838; GFX90A-NEXT: s_setpc_b64 s[30:31] 11839; 11840; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3: 11841; GFX940: ; %bb.0: 11842; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11843; GFX940-NEXT: ;;#ASMSTART 11844; GFX940-NEXT: ; def s[0:1] 11845; 
GFX940-NEXT: ;;#ASMEND 11846; GFX940-NEXT: s_lshr_b32 s0, s0, 16 11847; GFX940-NEXT: ;;#ASMSTART 11848; GFX940-NEXT: ; def s[2:3] 11849; GFX940-NEXT: ;;#ASMEND 11850; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 11851; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s2 11852; GFX940-NEXT: ;;#ASMSTART 11853; GFX940-NEXT: ; use s[8:9] 11854; GFX940-NEXT: ;;#ASMEND 11855; GFX940-NEXT: s_setpc_b64 s[30:31] 11856 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11857 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11858 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11859 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11860 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 3, i32 3> 11861 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11862 ret void 11863} 11864 11865define void @s_shuffle_v4bf16_v3bf16__5_2_3_3() { 11866; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3: 11867; GFX900: ; %bb.0: 11868; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11869; GFX900-NEXT: ;;#ASMSTART 11870; GFX900-NEXT: ; def s[4:5] 11871; GFX900-NEXT: ;;#ASMEND 11872; GFX900-NEXT: ;;#ASMSTART 11873; GFX900-NEXT: ; def s[6:7] 11874; GFX900-NEXT: ;;#ASMEND 11875; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 11876; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s6 11877; GFX900-NEXT: ;;#ASMSTART 11878; GFX900-NEXT: ; use s[8:9] 11879; GFX900-NEXT: ;;#ASMEND 11880; GFX900-NEXT: s_setpc_b64 s[30:31] 11881; 11882; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3: 11883; GFX90A: ; %bb.0: 11884; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11885; GFX90A-NEXT: ;;#ASMSTART 11886; GFX90A-NEXT: ; def s[4:5] 11887; GFX90A-NEXT: ;;#ASMEND 11888; GFX90A-NEXT: ;;#ASMSTART 11889; GFX90A-NEXT: ; def s[6:7] 11890; GFX90A-NEXT: ;;#ASMEND 11891; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 11892; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s6, s6 11893; 
GFX90A-NEXT: ;;#ASMSTART 11894; GFX90A-NEXT: ; use s[8:9] 11895; GFX90A-NEXT: ;;#ASMEND 11896; GFX90A-NEXT: s_setpc_b64 s[30:31] 11897; 11898; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3: 11899; GFX940: ; %bb.0: 11900; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11901; GFX940-NEXT: ;;#ASMSTART 11902; GFX940-NEXT: ; def s[0:1] 11903; GFX940-NEXT: ;;#ASMEND 11904; GFX940-NEXT: ;;#ASMSTART 11905; GFX940-NEXT: ; def s[2:3] 11906; GFX940-NEXT: ;;#ASMEND 11907; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 11908; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s2 11909; GFX940-NEXT: ;;#ASMSTART 11910; GFX940-NEXT: ; use s[8:9] 11911; GFX940-NEXT: ;;#ASMEND 11912; GFX940-NEXT: s_setpc_b64 s[30:31] 11913 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11914 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11915 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11916 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11917 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 3, i32 3> 11918 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11919 ret void 11920} 11921 11922define void @s_shuffle_v4bf16_v3bf16__5_4_3_3() { 11923; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3: 11924; GFX900: ; %bb.0: 11925; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11926; GFX900-NEXT: ;;#ASMSTART 11927; GFX900-NEXT: ; def s[4:5] 11928; GFX900-NEXT: ;;#ASMEND 11929; GFX900-NEXT: s_lshr_b32 s6, s4, 16 11930; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s6 11931; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11932; GFX900-NEXT: ;;#ASMSTART 11933; GFX900-NEXT: ; use s[8:9] 11934; GFX900-NEXT: ;;#ASMEND 11935; GFX900-NEXT: s_setpc_b64 s[30:31] 11936; 11937; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3: 11938; GFX90A: ; %bb.0: 11939; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11940; GFX90A-NEXT: ;;#ASMSTART 11941; 
GFX90A-NEXT: ; def s[4:5] 11942; GFX90A-NEXT: ;;#ASMEND 11943; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 11944; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s6 11945; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11946; GFX90A-NEXT: ;;#ASMSTART 11947; GFX90A-NEXT: ; use s[8:9] 11948; GFX90A-NEXT: ;;#ASMEND 11949; GFX90A-NEXT: s_setpc_b64 s[30:31] 11950; 11951; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3: 11952; GFX940: ; %bb.0: 11953; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11954; GFX940-NEXT: ;;#ASMSTART 11955; GFX940-NEXT: ; def s[0:1] 11956; GFX940-NEXT: ;;#ASMEND 11957; GFX940-NEXT: s_lshr_b32 s2, s0, 16 11958; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s2 11959; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 11960; GFX940-NEXT: ;;#ASMSTART 11961; GFX940-NEXT: ; use s[8:9] 11962; GFX940-NEXT: ;;#ASMEND 11963; GFX940-NEXT: s_setpc_b64 s[30:31] 11964 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 11965 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 11966 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11967 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 11968 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 3, i32 3> 11969 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 11970 ret void 11971} 11972 11973define void @s_shuffle_v4bf16_v3bf16__5_5_3_3() { 11974; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3: 11975; GFX900: ; %bb.0: 11976; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11977; GFX900-NEXT: ;;#ASMSTART 11978; GFX900-NEXT: ; def s[4:5] 11979; GFX900-NEXT: ;;#ASMEND 11980; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11981; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 11982; GFX900-NEXT: ;;#ASMSTART 11983; GFX900-NEXT: ; use s[8:9] 11984; GFX900-NEXT: ;;#ASMEND 11985; GFX900-NEXT: s_setpc_b64 s[30:31] 11986; 11987; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3: 11988; GFX90A: ; %bb.0: 
11989; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11990; GFX90A-NEXT: ;;#ASMSTART 11991; GFX90A-NEXT: ; def s[4:5] 11992; GFX90A-NEXT: ;;#ASMEND 11993; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 11994; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 11995; GFX90A-NEXT: ;;#ASMSTART 11996; GFX90A-NEXT: ; use s[8:9] 11997; GFX90A-NEXT: ;;#ASMEND 11998; GFX90A-NEXT: s_setpc_b64 s[30:31] 11999; 12000; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3: 12001; GFX940: ; %bb.0: 12002; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12003; GFX940-NEXT: ;;#ASMSTART 12004; GFX940-NEXT: ; def s[0:1] 12005; GFX940-NEXT: ;;#ASMEND 12006; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12007; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 12008; GFX940-NEXT: ;;#ASMSTART 12009; GFX940-NEXT: ; use s[8:9] 12010; GFX940-NEXT: ;;#ASMEND 12011; GFX940-NEXT: s_setpc_b64 s[30:31] 12012 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12013 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12014 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12015 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12016 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 3> 12017 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12018 ret void 12019} 12020 12021define void @s_shuffle_v4bf16_v3bf16__5_5_u_3() { 12022; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3: 12023; GFX900: ; %bb.0: 12024; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12025; GFX900-NEXT: ;;#ASMSTART 12026; GFX900-NEXT: ; def s[4:5] 12027; GFX900-NEXT: ;;#ASMEND 12028; GFX900-NEXT: s_lshl_b32 s9, s4, 16 12029; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 12030; GFX900-NEXT: ;;#ASMSTART 12031; GFX900-NEXT: ; use s[8:9] 12032; GFX900-NEXT: ;;#ASMEND 12033; GFX900-NEXT: s_setpc_b64 s[30:31] 12034; 12035; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3: 12036; GFX90A: ; 
%bb.0: 12037; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12038; GFX90A-NEXT: ;;#ASMSTART 12039; GFX90A-NEXT: ; def s[4:5] 12040; GFX90A-NEXT: ;;#ASMEND 12041; GFX90A-NEXT: s_lshl_b32 s9, s4, 16 12042; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 12043; GFX90A-NEXT: ;;#ASMSTART 12044; GFX90A-NEXT: ; use s[8:9] 12045; GFX90A-NEXT: ;;#ASMEND 12046; GFX90A-NEXT: s_setpc_b64 s[30:31] 12047; 12048; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3: 12049; GFX940: ; %bb.0: 12050; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12051; GFX940-NEXT: ;;#ASMSTART 12052; GFX940-NEXT: ; def s[0:1] 12053; GFX940-NEXT: ;;#ASMEND 12054; GFX940-NEXT: s_lshl_b32 s9, s0, 16 12055; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 12056; GFX940-NEXT: ;;#ASMSTART 12057; GFX940-NEXT: ; use s[8:9] 12058; GFX940-NEXT: ;;#ASMEND 12059; GFX940-NEXT: s_setpc_b64 s[30:31] 12060 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12061 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12062 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12063 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12064 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 3> 12065 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12066 ret void 12067} 12068 12069define void @s_shuffle_v4bf16_v3bf16__5_5_0_3() { 12070; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3: 12071; GFX900: ; %bb.0: 12072; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12073; GFX900-NEXT: ;;#ASMSTART 12074; GFX900-NEXT: ; def s[4:5] 12075; GFX900-NEXT: ;;#ASMEND 12076; GFX900-NEXT: ;;#ASMSTART 12077; GFX900-NEXT: ; def s[6:7] 12078; GFX900-NEXT: ;;#ASMEND 12079; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s6 12080; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 12081; GFX900-NEXT: ;;#ASMSTART 12082; GFX900-NEXT: ; use s[8:9] 12083; GFX900-NEXT: ;;#ASMEND 12084; GFX900-NEXT: 
s_setpc_b64 s[30:31] 12085; 12086; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3: 12087; GFX90A: ; %bb.0: 12088; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12089; GFX90A-NEXT: ;;#ASMSTART 12090; GFX90A-NEXT: ; def s[4:5] 12091; GFX90A-NEXT: ;;#ASMEND 12092; GFX90A-NEXT: ;;#ASMSTART 12093; GFX90A-NEXT: ; def s[6:7] 12094; GFX90A-NEXT: ;;#ASMEND 12095; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s6 12096; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 12097; GFX90A-NEXT: ;;#ASMSTART 12098; GFX90A-NEXT: ; use s[8:9] 12099; GFX90A-NEXT: ;;#ASMEND 12100; GFX90A-NEXT: s_setpc_b64 s[30:31] 12101; 12102; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3: 12103; GFX940: ; %bb.0: 12104; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12105; GFX940-NEXT: ;;#ASMSTART 12106; GFX940-NEXT: ; def s[0:1] 12107; GFX940-NEXT: ;;#ASMEND 12108; GFX940-NEXT: ;;#ASMSTART 12109; GFX940-NEXT: ; def s[2:3] 12110; GFX940-NEXT: ;;#ASMEND 12111; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s2 12112; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 12113; GFX940-NEXT: ;;#ASMSTART 12114; GFX940-NEXT: ; use s[8:9] 12115; GFX940-NEXT: ;;#ASMEND 12116; GFX940-NEXT: s_setpc_b64 s[30:31] 12117 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12118 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12119 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12120 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12121 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 3> 12122 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12123 ret void 12124} 12125 12126define void @s_shuffle_v4bf16_v3bf16__5_5_1_3() { 12127; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3: 12128; GFX900: ; %bb.0: 12129; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12130; GFX900-NEXT: ;;#ASMSTART 12131; GFX900-NEXT: ; def s[4:5] 12132; GFX900-NEXT: ;;#ASMEND 12133; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 12134; GFX900-NEXT: ;;#ASMSTART 12135; GFX900-NEXT: ; def s[6:7] 12136; GFX900-NEXT: ;;#ASMEND 12137; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s6 12138; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 12139; GFX900-NEXT: ;;#ASMSTART 12140; GFX900-NEXT: ; use s[8:9] 12141; GFX900-NEXT: ;;#ASMEND 12142; GFX900-NEXT: s_setpc_b64 s[30:31] 12143; 12144; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3: 12145; GFX90A: ; %bb.0: 12146; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12147; GFX90A-NEXT: ;;#ASMSTART 12148; GFX90A-NEXT: ; def s[4:5] 12149; GFX90A-NEXT: ;;#ASMEND 12150; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12151; GFX90A-NEXT: ;;#ASMSTART 12152; GFX90A-NEXT: ; def s[6:7] 12153; GFX90A-NEXT: ;;#ASMEND 12154; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s6 12155; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 12156; GFX90A-NEXT: ;;#ASMSTART 12157; GFX90A-NEXT: ; use s[8:9] 12158; GFX90A-NEXT: ;;#ASMEND 12159; GFX90A-NEXT: s_setpc_b64 s[30:31] 12160; 12161; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3: 12162; GFX940: ; %bb.0: 12163; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12164; GFX940-NEXT: ;;#ASMSTART 12165; GFX940-NEXT: ; def s[0:1] 12166; GFX940-NEXT: ;;#ASMEND 12167; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12168; GFX940-NEXT: ;;#ASMSTART 12169; GFX940-NEXT: ; def s[2:3] 12170; GFX940-NEXT: ;;#ASMEND 12171; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s2 12172; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 12173; GFX940-NEXT: ;;#ASMSTART 12174; GFX940-NEXT: ; use s[8:9] 12175; GFX940-NEXT: ;;#ASMEND 12176; GFX940-NEXT: s_setpc_b64 s[30:31] 12177 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12178 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12179 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12180 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12181 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> 
<i32 5, i32 5, i32 1, i32 3> 12182 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12183 ret void 12184} 12185 12186define void @s_shuffle_v4bf16_v3bf16__5_5_2_3() { 12187; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3: 12188; GFX900: ; %bb.0: 12189; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12190; GFX900-NEXT: ;;#ASMSTART 12191; GFX900-NEXT: ; def s[4:5] 12192; GFX900-NEXT: ;;#ASMEND 12193; GFX900-NEXT: ;;#ASMSTART 12194; GFX900-NEXT: ; def s[6:7] 12195; GFX900-NEXT: ;;#ASMEND 12196; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s6 12197; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 12198; GFX900-NEXT: ;;#ASMSTART 12199; GFX900-NEXT: ; use s[8:9] 12200; GFX900-NEXT: ;;#ASMEND 12201; GFX900-NEXT: s_setpc_b64 s[30:31] 12202; 12203; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3: 12204; GFX90A: ; %bb.0: 12205; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12206; GFX90A-NEXT: ;;#ASMSTART 12207; GFX90A-NEXT: ; def s[4:5] 12208; GFX90A-NEXT: ;;#ASMEND 12209; GFX90A-NEXT: ;;#ASMSTART 12210; GFX90A-NEXT: ; def s[6:7] 12211; GFX90A-NEXT: ;;#ASMEND 12212; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s6 12213; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 12214; GFX90A-NEXT: ;;#ASMSTART 12215; GFX90A-NEXT: ; use s[8:9] 12216; GFX90A-NEXT: ;;#ASMEND 12217; GFX90A-NEXT: s_setpc_b64 s[30:31] 12218; 12219; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3: 12220; GFX940: ; %bb.0: 12221; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12222; GFX940-NEXT: ;;#ASMSTART 12223; GFX940-NEXT: ; def s[0:1] 12224; GFX940-NEXT: ;;#ASMEND 12225; GFX940-NEXT: ;;#ASMSTART 12226; GFX940-NEXT: ; def s[2:3] 12227; GFX940-NEXT: ;;#ASMEND 12228; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s2 12229; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 12230; GFX940-NEXT: ;;#ASMSTART 12231; GFX940-NEXT: ; use s[8:9] 12232; GFX940-NEXT: ;;#ASMEND 12233; GFX940-NEXT: s_setpc_b64 s[30:31] 12234 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12235 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 
12236 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12237 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12238 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 3> 12239 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12240 ret void 12241} 12242 12243define void @s_shuffle_v4bf16_v3bf16__5_5_4_3() { 12244; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3: 12245; GFX900: ; %bb.0: 12246; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12247; GFX900-NEXT: ;;#ASMSTART 12248; GFX900-NEXT: ; def s[4:5] 12249; GFX900-NEXT: ;;#ASMEND 12250; GFX900-NEXT: s_lshr_b32 s6, s4, 16 12251; GFX900-NEXT: s_pack_ll_b32_b16 s9, s6, s4 12252; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 12253; GFX900-NEXT: ;;#ASMSTART 12254; GFX900-NEXT: ; use s[8:9] 12255; GFX900-NEXT: ;;#ASMEND 12256; GFX900-NEXT: s_setpc_b64 s[30:31] 12257; 12258; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3: 12259; GFX90A: ; %bb.0: 12260; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12261; GFX90A-NEXT: ;;#ASMSTART 12262; GFX90A-NEXT: ; def s[4:5] 12263; GFX90A-NEXT: ;;#ASMEND 12264; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 12265; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s6, s4 12266; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 12267; GFX90A-NEXT: ;;#ASMSTART 12268; GFX90A-NEXT: ; use s[8:9] 12269; GFX90A-NEXT: ;;#ASMEND 12270; GFX90A-NEXT: s_setpc_b64 s[30:31] 12271; 12272; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3: 12273; GFX940: ; %bb.0: 12274; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12275; GFX940-NEXT: ;;#ASMSTART 12276; GFX940-NEXT: ; def s[0:1] 12277; GFX940-NEXT: ;;#ASMEND 12278; GFX940-NEXT: s_lshr_b32 s2, s0, 16 12279; GFX940-NEXT: s_pack_ll_b32_b16 s9, s2, s0 12280; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 12281; GFX940-NEXT: ;;#ASMSTART 12282; GFX940-NEXT: ; use s[8:9] 12283; GFX940-NEXT: ;;#ASMEND 12284; 
GFX940-NEXT: s_setpc_b64 s[30:31] 12285 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12286 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12287 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12288 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12289 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 3> 12290 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12291 ret void 12292} 12293 12294define void @s_shuffle_v4bf16_v3bf16__u_4_4_4() { 12295; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4: 12296; GFX900: ; %bb.0: 12297; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12298; GFX900-NEXT: ;;#ASMSTART 12299; GFX900-NEXT: ; def s[8:9] 12300; GFX900-NEXT: ;;#ASMEND 12301; GFX900-NEXT: s_lshr_b32 s4, s8, 16 12302; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12303; GFX900-NEXT: ;;#ASMSTART 12304; GFX900-NEXT: ; use s[8:9] 12305; GFX900-NEXT: ;;#ASMEND 12306; GFX900-NEXT: s_setpc_b64 s[30:31] 12307; 12308; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4: 12309; GFX90A: ; %bb.0: 12310; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12311; GFX90A-NEXT: ;;#ASMSTART 12312; GFX90A-NEXT: ; def s[8:9] 12313; GFX90A-NEXT: ;;#ASMEND 12314; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 12315; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12316; GFX90A-NEXT: ;;#ASMSTART 12317; GFX90A-NEXT: ; use s[8:9] 12318; GFX90A-NEXT: ;;#ASMEND 12319; GFX90A-NEXT: s_setpc_b64 s[30:31] 12320; 12321; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4: 12322; GFX940: ; %bb.0: 12323; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12324; GFX940-NEXT: ;;#ASMSTART 12325; GFX940-NEXT: ; def s[8:9] 12326; GFX940-NEXT: ;;#ASMEND 12327; GFX940-NEXT: s_lshr_b32 s0, s8, 16 12328; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12329; GFX940-NEXT: ;;#ASMSTART 12330; GFX940-NEXT: ; use s[8:9] 12331; GFX940-NEXT: ;;#ASMEND 12332; 
GFX940-NEXT: s_setpc_b64 s[30:31] 12333 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12334 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12335 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12336 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12337 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 4, i32 4, i32 4> 12338 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12339 ret void 12340} 12341 12342define void @s_shuffle_v4bf16_v3bf16__0_4_4_4() { 12343; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4: 12344; GFX900: ; %bb.0: 12345; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12346; GFX900-NEXT: ;;#ASMSTART 12347; GFX900-NEXT: ; def s[4:5] 12348; GFX900-NEXT: ;;#ASMEND 12349; GFX900-NEXT: ;;#ASMSTART 12350; GFX900-NEXT: ; def s[6:7] 12351; GFX900-NEXT: ;;#ASMEND 12352; GFX900-NEXT: s_lshr_b32 s5, s6, 16 12353; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 12354; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 12355; GFX900-NEXT: ;;#ASMSTART 12356; GFX900-NEXT: ; use s[8:9] 12357; GFX900-NEXT: ;;#ASMEND 12358; GFX900-NEXT: s_setpc_b64 s[30:31] 12359; 12360; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4: 12361; GFX90A: ; %bb.0: 12362; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12363; GFX90A-NEXT: ;;#ASMSTART 12364; GFX90A-NEXT: ; def s[4:5] 12365; GFX90A-NEXT: ;;#ASMEND 12366; GFX90A-NEXT: ;;#ASMSTART 12367; GFX90A-NEXT: ; def s[6:7] 12368; GFX90A-NEXT: ;;#ASMEND 12369; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 12370; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 12371; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 12372; GFX90A-NEXT: ;;#ASMSTART 12373; GFX90A-NEXT: ; use s[8:9] 12374; GFX90A-NEXT: ;;#ASMEND 12375; GFX90A-NEXT: s_setpc_b64 s[30:31] 12376; 12377; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4: 12378; GFX940: ; %bb.0: 12379; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
12380; GFX940-NEXT: ;;#ASMSTART 12381; GFX940-NEXT: ; def s[0:1] 12382; GFX940-NEXT: ;;#ASMEND 12383; GFX940-NEXT: ;;#ASMSTART 12384; GFX940-NEXT: ; def s[2:3] 12385; GFX940-NEXT: ;;#ASMEND 12386; GFX940-NEXT: s_lshr_b32 s1, s2, 16 12387; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 12388; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 12389; GFX940-NEXT: ;;#ASMSTART 12390; GFX940-NEXT: ; use s[8:9] 12391; GFX940-NEXT: ;;#ASMEND 12392; GFX940-NEXT: s_setpc_b64 s[30:31] 12393 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12394 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12395 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12396 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12397 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 4, i32 4, i32 4> 12398 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12399 ret void 12400} 12401 12402define void @s_shuffle_v4bf16_v3bf16__1_4_4_4() { 12403; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4: 12404; GFX900: ; %bb.0: 12405; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12406; GFX900-NEXT: ;;#ASMSTART 12407; GFX900-NEXT: ; def s[4:5] 12408; GFX900-NEXT: ;;#ASMEND 12409; GFX900-NEXT: ;;#ASMSTART 12410; GFX900-NEXT: ; def s[6:7] 12411; GFX900-NEXT: ;;#ASMEND 12412; GFX900-NEXT: s_lshr_b32 s5, s6, 16 12413; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12414; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 12415; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 12416; GFX900-NEXT: ;;#ASMSTART 12417; GFX900-NEXT: ; use s[8:9] 12418; GFX900-NEXT: ;;#ASMEND 12419; GFX900-NEXT: s_setpc_b64 s[30:31] 12420; 12421; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4: 12422; GFX90A: ; %bb.0: 12423; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12424; GFX90A-NEXT: ;;#ASMSTART 12425; GFX90A-NEXT: ; def s[4:5] 12426; GFX90A-NEXT: ;;#ASMEND 12427; GFX90A-NEXT: ;;#ASMSTART 12428; GFX90A-NEXT: 
; def s[6:7] 12429; GFX90A-NEXT: ;;#ASMEND 12430; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 12431; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12432; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 12433; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 12434; GFX90A-NEXT: ;;#ASMSTART 12435; GFX90A-NEXT: ; use s[8:9] 12436; GFX90A-NEXT: ;;#ASMEND 12437; GFX90A-NEXT: s_setpc_b64 s[30:31] 12438; 12439; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4: 12440; GFX940: ; %bb.0: 12441; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12442; GFX940-NEXT: ;;#ASMSTART 12443; GFX940-NEXT: ; def s[0:1] 12444; GFX940-NEXT: ;;#ASMEND 12445; GFX940-NEXT: ;;#ASMSTART 12446; GFX940-NEXT: ; def s[2:3] 12447; GFX940-NEXT: ;;#ASMEND 12448; GFX940-NEXT: s_lshr_b32 s1, s2, 16 12449; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12450; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 12451; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 12452; GFX940-NEXT: ;;#ASMSTART 12453; GFX940-NEXT: ; use s[8:9] 12454; GFX940-NEXT: ;;#ASMEND 12455; GFX940-NEXT: s_setpc_b64 s[30:31] 12456 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12457 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12458 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12459 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12460 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 4, i32 4, i32 4> 12461 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12462 ret void 12463} 12464 12465define void @s_shuffle_v4bf16_v3bf16__2_4_4_4() { 12466; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4: 12467; GFX900: ; %bb.0: 12468; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12469; GFX900-NEXT: ;;#ASMSTART 12470; GFX900-NEXT: ; def s[4:5] 12471; GFX900-NEXT: ;;#ASMEND 12472; GFX900-NEXT: ;;#ASMSTART 12473; GFX900-NEXT: ; def s[6:7] 12474; GFX900-NEXT: ;;#ASMEND 12475; GFX900-NEXT: s_lshr_b32 s4, s6, 16 12476; GFX900-NEXT: 
s_pack_ll_b32_b16 s8, s5, s4 12477; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12478; GFX900-NEXT: ;;#ASMSTART 12479; GFX900-NEXT: ; use s[8:9] 12480; GFX900-NEXT: ;;#ASMEND 12481; GFX900-NEXT: s_setpc_b64 s[30:31] 12482; 12483; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4: 12484; GFX90A: ; %bb.0: 12485; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12486; GFX90A-NEXT: ;;#ASMSTART 12487; GFX90A-NEXT: ; def s[4:5] 12488; GFX90A-NEXT: ;;#ASMEND 12489; GFX90A-NEXT: ;;#ASMSTART 12490; GFX90A-NEXT: ; def s[6:7] 12491; GFX90A-NEXT: ;;#ASMEND 12492; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 12493; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 12494; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12495; GFX90A-NEXT: ;;#ASMSTART 12496; GFX90A-NEXT: ; use s[8:9] 12497; GFX90A-NEXT: ;;#ASMEND 12498; GFX90A-NEXT: s_setpc_b64 s[30:31] 12499; 12500; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4: 12501; GFX940: ; %bb.0: 12502; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12503; GFX940-NEXT: ;;#ASMSTART 12504; GFX940-NEXT: ; def s[0:1] 12505; GFX940-NEXT: ;;#ASMEND 12506; GFX940-NEXT: ;;#ASMSTART 12507; GFX940-NEXT: ; def s[2:3] 12508; GFX940-NEXT: ;;#ASMEND 12509; GFX940-NEXT: s_lshr_b32 s0, s2, 16 12510; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 12511; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12512; GFX940-NEXT: ;;#ASMSTART 12513; GFX940-NEXT: ; use s[8:9] 12514; GFX940-NEXT: ;;#ASMEND 12515; GFX940-NEXT: s_setpc_b64 s[30:31] 12516 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12517 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12518 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12519 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12520 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 4, i32 4, i32 4> 12521 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12522 ret void 12523} 12524 12525define void 
@s_shuffle_v4bf16_v3bf16__3_4_4_4() { 12526; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4: 12527; GFX900: ; %bb.0: 12528; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12529; GFX900-NEXT: ;;#ASMSTART 12530; GFX900-NEXT: ; def s[8:9] 12531; GFX900-NEXT: ;;#ASMEND 12532; GFX900-NEXT: s_lshr_b32 s4, s8, 16 12533; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12534; GFX900-NEXT: ;;#ASMSTART 12535; GFX900-NEXT: ; use s[8:9] 12536; GFX900-NEXT: ;;#ASMEND 12537; GFX900-NEXT: s_setpc_b64 s[30:31] 12538; 12539; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4: 12540; GFX90A: ; %bb.0: 12541; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12542; GFX90A-NEXT: ;;#ASMSTART 12543; GFX90A-NEXT: ; def s[8:9] 12544; GFX90A-NEXT: ;;#ASMEND 12545; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 12546; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12547; GFX90A-NEXT: ;;#ASMSTART 12548; GFX90A-NEXT: ; use s[8:9] 12549; GFX90A-NEXT: ;;#ASMEND 12550; GFX90A-NEXT: s_setpc_b64 s[30:31] 12551; 12552; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4: 12553; GFX940: ; %bb.0: 12554; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12555; GFX940-NEXT: ;;#ASMSTART 12556; GFX940-NEXT: ; def s[8:9] 12557; GFX940-NEXT: ;;#ASMEND 12558; GFX940-NEXT: s_lshr_b32 s0, s8, 16 12559; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12560; GFX940-NEXT: ;;#ASMSTART 12561; GFX940-NEXT: ; use s[8:9] 12562; GFX940-NEXT: ;;#ASMEND 12563; GFX940-NEXT: s_setpc_b64 s[30:31] 12564 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12565 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12566 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12567 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12568 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 4, i32 4, i32 4> 12569 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12570 ret void 12571} 12572 12573define void 
@s_shuffle_v4bf16_v3bf16__4_4_4_4() { 12574; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4: 12575; GFX900: ; %bb.0: 12576; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12577; GFX900-NEXT: ;;#ASMSTART 12578; GFX900-NEXT: ; def s[4:5] 12579; GFX900-NEXT: ;;#ASMEND 12580; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12581; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s4 12582; GFX900-NEXT: s_mov_b32 s9, s8 12583; GFX900-NEXT: ;;#ASMSTART 12584; GFX900-NEXT: ; use s[8:9] 12585; GFX900-NEXT: ;;#ASMEND 12586; GFX900-NEXT: s_setpc_b64 s[30:31] 12587; 12588; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4: 12589; GFX90A: ; %bb.0: 12590; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12591; GFX90A-NEXT: ;;#ASMSTART 12592; GFX90A-NEXT: ; def s[4:5] 12593; GFX90A-NEXT: ;;#ASMEND 12594; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12595; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s4 12596; GFX90A-NEXT: s_mov_b32 s9, s8 12597; GFX90A-NEXT: ;;#ASMSTART 12598; GFX90A-NEXT: ; use s[8:9] 12599; GFX90A-NEXT: ;;#ASMEND 12600; GFX90A-NEXT: s_setpc_b64 s[30:31] 12601; 12602; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4: 12603; GFX940: ; %bb.0: 12604; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12605; GFX940-NEXT: ;;#ASMSTART 12606; GFX940-NEXT: ; def s[0:1] 12607; GFX940-NEXT: ;;#ASMEND 12608; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12609; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s0 12610; GFX940-NEXT: s_mov_b32 s9, s8 12611; GFX940-NEXT: ;;#ASMSTART 12612; GFX940-NEXT: ; use s[8:9] 12613; GFX940-NEXT: ;;#ASMEND 12614; GFX940-NEXT: s_setpc_b64 s[30:31] 12615 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12616 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12617 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12618 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12619 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 4, i32 4, i32 4> 12620 call void 
asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12621 ret void 12622} 12623 12624define void @s_shuffle_v4bf16_v3bf16__5_4_4_4() { 12625; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4: 12626; GFX900: ; %bb.0: 12627; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12628; GFX900-NEXT: ;;#ASMSTART 12629; GFX900-NEXT: ; def s[4:5] 12630; GFX900-NEXT: ;;#ASMEND 12631; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12632; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 12633; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12634; GFX900-NEXT: ;;#ASMSTART 12635; GFX900-NEXT: ; use s[8:9] 12636; GFX900-NEXT: ;;#ASMEND 12637; GFX900-NEXT: s_setpc_b64 s[30:31] 12638; 12639; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4: 12640; GFX90A: ; %bb.0: 12641; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12642; GFX90A-NEXT: ;;#ASMSTART 12643; GFX90A-NEXT: ; def s[4:5] 12644; GFX90A-NEXT: ;;#ASMEND 12645; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12646; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 12647; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12648; GFX90A-NEXT: ;;#ASMSTART 12649; GFX90A-NEXT: ; use s[8:9] 12650; GFX90A-NEXT: ;;#ASMEND 12651; GFX90A-NEXT: s_setpc_b64 s[30:31] 12652; 12653; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4: 12654; GFX940: ; %bb.0: 12655; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12656; GFX940-NEXT: ;;#ASMSTART 12657; GFX940-NEXT: ; def s[0:1] 12658; GFX940-NEXT: ;;#ASMEND 12659; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12660; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 12661; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12662; GFX940-NEXT: ;;#ASMSTART 12663; GFX940-NEXT: ; use s[8:9] 12664; GFX940-NEXT: ;;#ASMEND 12665; GFX940-NEXT: s_setpc_b64 s[30:31] 12666 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12667 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12668 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12669 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, 
i32 2> 12670 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 4, i32 4> 12671 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12672 ret void 12673} 12674 12675define void @s_shuffle_v4bf16_v3bf16__5_u_4_4() { 12676; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4: 12677; GFX900: ; %bb.0: 12678; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12679; GFX900-NEXT: ;;#ASMSTART 12680; GFX900-NEXT: ; def s[4:5] 12681; GFX900-NEXT: ;;#ASMEND 12682; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12683; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12684; GFX900-NEXT: s_mov_b32 s8, s5 12685; GFX900-NEXT: ;;#ASMSTART 12686; GFX900-NEXT: ; use s[8:9] 12687; GFX900-NEXT: ;;#ASMEND 12688; GFX900-NEXT: s_setpc_b64 s[30:31] 12689; 12690; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4: 12691; GFX90A: ; %bb.0: 12692; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12693; GFX90A-NEXT: ;;#ASMSTART 12694; GFX90A-NEXT: ; def s[4:5] 12695; GFX90A-NEXT: ;;#ASMEND 12696; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12697; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12698; GFX90A-NEXT: s_mov_b32 s8, s5 12699; GFX90A-NEXT: ;;#ASMSTART 12700; GFX90A-NEXT: ; use s[8:9] 12701; GFX90A-NEXT: ;;#ASMEND 12702; GFX90A-NEXT: s_setpc_b64 s[30:31] 12703; 12704; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4: 12705; GFX940: ; %bb.0: 12706; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12707; GFX940-NEXT: ;;#ASMSTART 12708; GFX940-NEXT: ; def s[0:1] 12709; GFX940-NEXT: ;;#ASMEND 12710; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12711; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12712; GFX940-NEXT: s_mov_b32 s8, s1 12713; GFX940-NEXT: ;;#ASMSTART 12714; GFX940-NEXT: ; use s[8:9] 12715; GFX940-NEXT: ;;#ASMEND 12716; GFX940-NEXT: s_setpc_b64 s[30:31] 12717 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12718 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12719 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 
2> 12720 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12721 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 4, i32 4> 12722 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12723 ret void 12724} 12725 12726define void @s_shuffle_v4bf16_v3bf16__5_0_4_4() { 12727; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4: 12728; GFX900: ; %bb.0: 12729; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12730; GFX900-NEXT: ;;#ASMSTART 12731; GFX900-NEXT: ; def s[4:5] 12732; GFX900-NEXT: ;;#ASMEND 12733; GFX900-NEXT: ;;#ASMSTART 12734; GFX900-NEXT: ; def s[6:7] 12735; GFX900-NEXT: ;;#ASMEND 12736; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 12737; GFX900-NEXT: s_lshr_b32 s4, s6, 16 12738; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12739; GFX900-NEXT: ;;#ASMSTART 12740; GFX900-NEXT: ; use s[8:9] 12741; GFX900-NEXT: ;;#ASMEND 12742; GFX900-NEXT: s_setpc_b64 s[30:31] 12743; 12744; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4: 12745; GFX90A: ; %bb.0: 12746; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12747; GFX90A-NEXT: ;;#ASMSTART 12748; GFX90A-NEXT: ; def s[4:5] 12749; GFX90A-NEXT: ;;#ASMEND 12750; GFX90A-NEXT: ;;#ASMSTART 12751; GFX90A-NEXT: ; def s[6:7] 12752; GFX90A-NEXT: ;;#ASMEND 12753; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 12754; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 12755; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12756; GFX90A-NEXT: ;;#ASMSTART 12757; GFX90A-NEXT: ; use s[8:9] 12758; GFX90A-NEXT: ;;#ASMEND 12759; GFX90A-NEXT: s_setpc_b64 s[30:31] 12760; 12761; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4: 12762; GFX940: ; %bb.0: 12763; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12764; GFX940-NEXT: ;;#ASMSTART 12765; GFX940-NEXT: ; def s[0:1] 12766; GFX940-NEXT: ;;#ASMEND 12767; GFX940-NEXT: ;;#ASMSTART 12768; GFX940-NEXT: ; def s[2:3] 12769; GFX940-NEXT: ;;#ASMEND 12770; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 12771; 
GFX940-NEXT: s_lshr_b32 s0, s2, 16 12772; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12773; GFX940-NEXT: ;;#ASMSTART 12774; GFX940-NEXT: ; use s[8:9] 12775; GFX940-NEXT: ;;#ASMEND 12776; GFX940-NEXT: s_setpc_b64 s[30:31] 12777 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12778 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12779 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12780 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12781 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 4, i32 4> 12782 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12783 ret void 12784} 12785 12786define void @s_shuffle_v4bf16_v3bf16__5_1_4_4() { 12787; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4: 12788; GFX900: ; %bb.0: 12789; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12790; GFX900-NEXT: ;;#ASMSTART 12791; GFX900-NEXT: ; def s[4:5] 12792; GFX900-NEXT: ;;#ASMEND 12793; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12794; GFX900-NEXT: ;;#ASMSTART 12795; GFX900-NEXT: ; def s[6:7] 12796; GFX900-NEXT: ;;#ASMEND 12797; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 12798; GFX900-NEXT: s_lshr_b32 s4, s6, 16 12799; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12800; GFX900-NEXT: ;;#ASMSTART 12801; GFX900-NEXT: ; use s[8:9] 12802; GFX900-NEXT: ;;#ASMEND 12803; GFX900-NEXT: s_setpc_b64 s[30:31] 12804; 12805; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4: 12806; GFX90A: ; %bb.0: 12807; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12808; GFX90A-NEXT: ;;#ASMSTART 12809; GFX90A-NEXT: ; def s[4:5] 12810; GFX90A-NEXT: ;;#ASMEND 12811; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12812; GFX90A-NEXT: ;;#ASMSTART 12813; GFX90A-NEXT: ; def s[6:7] 12814; GFX90A-NEXT: ;;#ASMEND 12815; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 12816; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 12817; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12818; GFX90A-NEXT: 
;;#ASMSTART 12819; GFX90A-NEXT: ; use s[8:9] 12820; GFX90A-NEXT: ;;#ASMEND 12821; GFX90A-NEXT: s_setpc_b64 s[30:31] 12822; 12823; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4: 12824; GFX940: ; %bb.0: 12825; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12826; GFX940-NEXT: ;;#ASMSTART 12827; GFX940-NEXT: ; def s[0:1] 12828; GFX940-NEXT: ;;#ASMEND 12829; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12830; GFX940-NEXT: ;;#ASMSTART 12831; GFX940-NEXT: ; def s[2:3] 12832; GFX940-NEXT: ;;#ASMEND 12833; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 12834; GFX940-NEXT: s_lshr_b32 s0, s2, 16 12835; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12836; GFX940-NEXT: ;;#ASMSTART 12837; GFX940-NEXT: ; use s[8:9] 12838; GFX940-NEXT: ;;#ASMEND 12839; GFX940-NEXT: s_setpc_b64 s[30:31] 12840 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12841 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12842 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12843 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12844 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 4, i32 4> 12845 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12846 ret void 12847} 12848 12849define void @s_shuffle_v4bf16_v3bf16__5_2_4_4() { 12850; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4: 12851; GFX900: ; %bb.0: 12852; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12853; GFX900-NEXT: ;;#ASMSTART 12854; GFX900-NEXT: ; def s[4:5] 12855; GFX900-NEXT: ;;#ASMEND 12856; GFX900-NEXT: ;;#ASMSTART 12857; GFX900-NEXT: ; def s[6:7] 12858; GFX900-NEXT: ;;#ASMEND 12859; GFX900-NEXT: s_lshr_b32 s4, s6, 16 12860; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 12861; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12862; GFX900-NEXT: ;;#ASMSTART 12863; GFX900-NEXT: ; use s[8:9] 12864; GFX900-NEXT: ;;#ASMEND 12865; GFX900-NEXT: s_setpc_b64 s[30:31] 12866; 12867; GFX90A-LABEL: 
s_shuffle_v4bf16_v3bf16__5_2_4_4: 12868; GFX90A: ; %bb.0: 12869; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12870; GFX90A-NEXT: ;;#ASMSTART 12871; GFX90A-NEXT: ; def s[4:5] 12872; GFX90A-NEXT: ;;#ASMEND 12873; GFX90A-NEXT: ;;#ASMSTART 12874; GFX90A-NEXT: ; def s[6:7] 12875; GFX90A-NEXT: ;;#ASMEND 12876; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 12877; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 12878; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12879; GFX90A-NEXT: ;;#ASMSTART 12880; GFX90A-NEXT: ; use s[8:9] 12881; GFX90A-NEXT: ;;#ASMEND 12882; GFX90A-NEXT: s_setpc_b64 s[30:31] 12883; 12884; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4: 12885; GFX940: ; %bb.0: 12886; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12887; GFX940-NEXT: ;;#ASMSTART 12888; GFX940-NEXT: ; def s[0:1] 12889; GFX940-NEXT: ;;#ASMEND 12890; GFX940-NEXT: ;;#ASMSTART 12891; GFX940-NEXT: ; def s[2:3] 12892; GFX940-NEXT: ;;#ASMEND 12893; GFX940-NEXT: s_lshr_b32 s0, s2, 16 12894; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 12895; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12896; GFX940-NEXT: ;;#ASMSTART 12897; GFX940-NEXT: ; use s[8:9] 12898; GFX940-NEXT: ;;#ASMEND 12899; GFX940-NEXT: s_setpc_b64 s[30:31] 12900 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12901 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12902 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12903 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12904 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 4, i32 4> 12905 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12906 ret void 12907} 12908 12909define void @s_shuffle_v4bf16_v3bf16__5_3_4_4() { 12910; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4: 12911; GFX900: ; %bb.0: 12912; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12913; GFX900-NEXT: ;;#ASMSTART 12914; GFX900-NEXT: ; def s[4:5] 
12915; GFX900-NEXT: ;;#ASMEND 12916; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 12917; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12918; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12919; GFX900-NEXT: ;;#ASMSTART 12920; GFX900-NEXT: ; use s[8:9] 12921; GFX900-NEXT: ;;#ASMEND 12922; GFX900-NEXT: s_setpc_b64 s[30:31] 12923; 12924; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4: 12925; GFX90A: ; %bb.0: 12926; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12927; GFX90A-NEXT: ;;#ASMSTART 12928; GFX90A-NEXT: ; def s[4:5] 12929; GFX90A-NEXT: ;;#ASMEND 12930; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 12931; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12932; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12933; GFX90A-NEXT: ;;#ASMSTART 12934; GFX90A-NEXT: ; use s[8:9] 12935; GFX90A-NEXT: ;;#ASMEND 12936; GFX90A-NEXT: s_setpc_b64 s[30:31] 12937; 12938; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4: 12939; GFX940: ; %bb.0: 12940; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12941; GFX940-NEXT: ;;#ASMSTART 12942; GFX940-NEXT: ; def s[0:1] 12943; GFX940-NEXT: ;;#ASMEND 12944; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 12945; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12946; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12947; GFX940-NEXT: ;;#ASMSTART 12948; GFX940-NEXT: ; use s[8:9] 12949; GFX940-NEXT: ;;#ASMEND 12950; GFX940-NEXT: s_setpc_b64 s[30:31] 12951 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 12952 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 12953 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12954 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 12955 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 4, i32 4> 12956 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 12957 ret void 12958} 12959 12960define void @s_shuffle_v4bf16_v3bf16__5_5_4_4() { 12961; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4: 12962; 
GFX900: ; %bb.0: 12963; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12964; GFX900-NEXT: ;;#ASMSTART 12965; GFX900-NEXT: ; def s[4:5] 12966; GFX900-NEXT: ;;#ASMEND 12967; GFX900-NEXT: s_lshr_b32 s4, s4, 16 12968; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12969; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 12970; GFX900-NEXT: ;;#ASMSTART 12971; GFX900-NEXT: ; use s[8:9] 12972; GFX900-NEXT: ;;#ASMEND 12973; GFX900-NEXT: s_setpc_b64 s[30:31] 12974; 12975; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4: 12976; GFX90A: ; %bb.0: 12977; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12978; GFX90A-NEXT: ;;#ASMSTART 12979; GFX90A-NEXT: ; def s[4:5] 12980; GFX90A-NEXT: ;;#ASMEND 12981; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 12982; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s4 12983; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 12984; GFX90A-NEXT: ;;#ASMSTART 12985; GFX90A-NEXT: ; use s[8:9] 12986; GFX90A-NEXT: ;;#ASMEND 12987; GFX90A-NEXT: s_setpc_b64 s[30:31] 12988; 12989; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4: 12990; GFX940: ; %bb.0: 12991; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12992; GFX940-NEXT: ;;#ASMSTART 12993; GFX940-NEXT: ; def s[0:1] 12994; GFX940-NEXT: ;;#ASMEND 12995; GFX940-NEXT: s_lshr_b32 s0, s0, 16 12996; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s0 12997; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 12998; GFX940-NEXT: ;;#ASMSTART 12999; GFX940-NEXT: ; use s[8:9] 13000; GFX940-NEXT: ;;#ASMEND 13001; GFX940-NEXT: s_setpc_b64 s[30:31] 13002 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13003 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13004 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13005 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13006 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 4> 13007 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13008 
ret void 13009} 13010 13011define void @s_shuffle_v4bf16_v3bf16__5_5_u_4() { 13012; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4: 13013; GFX900: ; %bb.0: 13014; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13015; GFX900-NEXT: ;;#ASMSTART 13016; GFX900-NEXT: ; def s[4:5] 13017; GFX900-NEXT: ;;#ASMEND 13018; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 13019; GFX900-NEXT: s_mov_b32 s9, s4 13020; GFX900-NEXT: ;;#ASMSTART 13021; GFX900-NEXT: ; use s[8:9] 13022; GFX900-NEXT: ;;#ASMEND 13023; GFX900-NEXT: s_setpc_b64 s[30:31] 13024; 13025; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4: 13026; GFX90A: ; %bb.0: 13027; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13028; GFX90A-NEXT: ;;#ASMSTART 13029; GFX90A-NEXT: ; def s[4:5] 13030; GFX90A-NEXT: ;;#ASMEND 13031; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 13032; GFX90A-NEXT: s_mov_b32 s9, s4 13033; GFX90A-NEXT: ;;#ASMSTART 13034; GFX90A-NEXT: ; use s[8:9] 13035; GFX90A-NEXT: ;;#ASMEND 13036; GFX90A-NEXT: s_setpc_b64 s[30:31] 13037; 13038; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4: 13039; GFX940: ; %bb.0: 13040; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13041; GFX940-NEXT: ;;#ASMSTART 13042; GFX940-NEXT: ; def s[0:1] 13043; GFX940-NEXT: ;;#ASMEND 13044; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 13045; GFX940-NEXT: s_mov_b32 s9, s0 13046; GFX940-NEXT: ;;#ASMSTART 13047; GFX940-NEXT: ; use s[8:9] 13048; GFX940-NEXT: ;;#ASMEND 13049; GFX940-NEXT: s_setpc_b64 s[30:31] 13050 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13051 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13052 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13053 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13054 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 4> 13055 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13056 ret void 13057} 
13058 13059define void @s_shuffle_v4bf16_v3bf16__5_5_0_4() { 13060; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4: 13061; GFX900: ; %bb.0: 13062; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13063; GFX900-NEXT: ;;#ASMSTART 13064; GFX900-NEXT: ; def s[4:5] 13065; GFX900-NEXT: ;;#ASMEND 13066; GFX900-NEXT: ;;#ASMSTART 13067; GFX900-NEXT: ; def s[6:7] 13068; GFX900-NEXT: ;;#ASMEND 13069; GFX900-NEXT: s_lshr_b32 s5, s6, 16 13070; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5 13071; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 13072; GFX900-NEXT: ;;#ASMSTART 13073; GFX900-NEXT: ; use s[8:9] 13074; GFX900-NEXT: ;;#ASMEND 13075; GFX900-NEXT: s_setpc_b64 s[30:31] 13076; 13077; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4: 13078; GFX90A: ; %bb.0: 13079; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13080; GFX90A-NEXT: ;;#ASMSTART 13081; GFX90A-NEXT: ; def s[4:5] 13082; GFX90A-NEXT: ;;#ASMEND 13083; GFX90A-NEXT: ;;#ASMSTART 13084; GFX90A-NEXT: ; def s[6:7] 13085; GFX90A-NEXT: ;;#ASMEND 13086; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 13087; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5 13088; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 13089; GFX90A-NEXT: ;;#ASMSTART 13090; GFX90A-NEXT: ; use s[8:9] 13091; GFX90A-NEXT: ;;#ASMEND 13092; GFX90A-NEXT: s_setpc_b64 s[30:31] 13093; 13094; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4: 13095; GFX940: ; %bb.0: 13096; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13097; GFX940-NEXT: ;;#ASMSTART 13098; GFX940-NEXT: ; def s[0:1] 13099; GFX940-NEXT: ;;#ASMEND 13100; GFX940-NEXT: ;;#ASMSTART 13101; GFX940-NEXT: ; def s[2:3] 13102; GFX940-NEXT: ;;#ASMEND 13103; GFX940-NEXT: s_lshr_b32 s1, s2, 16 13104; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1 13105; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 13106; GFX940-NEXT: ;;#ASMSTART 13107; GFX940-NEXT: ; use s[8:9] 13108; GFX940-NEXT: ;;#ASMEND 13109; GFX940-NEXT: s_setpc_b64 s[30:31] 13110 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13111 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 
13112 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13113 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13114 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 4> 13115 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13116 ret void 13117} 13118 13119define void @s_shuffle_v4bf16_v3bf16__5_5_1_4() { 13120; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4: 13121; GFX900: ; %bb.0: 13122; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13123; GFX900-NEXT: ;;#ASMSTART 13124; GFX900-NEXT: ; def s[4:5] 13125; GFX900-NEXT: ;;#ASMEND 13126; GFX900-NEXT: ;;#ASMSTART 13127; GFX900-NEXT: ; def s[6:7] 13128; GFX900-NEXT: ;;#ASMEND 13129; GFX900-NEXT: s_lshr_b32 s5, s6, 16 13130; GFX900-NEXT: s_lshr_b32 s4, s4, 16 13131; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5 13132; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 13133; GFX900-NEXT: ;;#ASMSTART 13134; GFX900-NEXT: ; use s[8:9] 13135; GFX900-NEXT: ;;#ASMEND 13136; GFX900-NEXT: s_setpc_b64 s[30:31] 13137; 13138; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4: 13139; GFX90A: ; %bb.0: 13140; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13141; GFX90A-NEXT: ;;#ASMSTART 13142; GFX90A-NEXT: ; def s[4:5] 13143; GFX90A-NEXT: ;;#ASMEND 13144; GFX90A-NEXT: ;;#ASMSTART 13145; GFX90A-NEXT: ; def s[6:7] 13146; GFX90A-NEXT: ;;#ASMEND 13147; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 13148; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 13149; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5 13150; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 13151; GFX90A-NEXT: ;;#ASMSTART 13152; GFX90A-NEXT: ; use s[8:9] 13153; GFX90A-NEXT: ;;#ASMEND 13154; GFX90A-NEXT: s_setpc_b64 s[30:31] 13155; 13156; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4: 13157; GFX940: ; %bb.0: 13158; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13159; GFX940-NEXT: ;;#ASMSTART 13160; GFX940-NEXT: ; def s[0:1] 
13161; GFX940-NEXT: ;;#ASMEND 13162; GFX940-NEXT: ;;#ASMSTART 13163; GFX940-NEXT: ; def s[2:3] 13164; GFX940-NEXT: ;;#ASMEND 13165; GFX940-NEXT: s_lshr_b32 s1, s2, 16 13166; GFX940-NEXT: s_lshr_b32 s0, s0, 16 13167; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1 13168; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 13169; GFX940-NEXT: ;;#ASMSTART 13170; GFX940-NEXT: ; use s[8:9] 13171; GFX940-NEXT: ;;#ASMEND 13172; GFX940-NEXT: s_setpc_b64 s[30:31] 13173 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13174 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13175 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13176 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13177 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 4> 13178 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13179 ret void 13180} 13181 13182define void @s_shuffle_v4bf16_v3bf16__5_5_2_4() { 13183; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4: 13184; GFX900: ; %bb.0: 13185; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13186; GFX900-NEXT: ;;#ASMSTART 13187; GFX900-NEXT: ; def s[4:5] 13188; GFX900-NEXT: ;;#ASMEND 13189; GFX900-NEXT: ;;#ASMSTART 13190; GFX900-NEXT: ; def s[6:7] 13191; GFX900-NEXT: ;;#ASMEND 13192; GFX900-NEXT: s_lshr_b32 s4, s6, 16 13193; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s4 13194; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7 13195; GFX900-NEXT: ;;#ASMSTART 13196; GFX900-NEXT: ; use s[8:9] 13197; GFX900-NEXT: ;;#ASMEND 13198; GFX900-NEXT: s_setpc_b64 s[30:31] 13199; 13200; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4: 13201; GFX90A: ; %bb.0: 13202; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13203; GFX90A-NEXT: ;;#ASMSTART 13204; GFX90A-NEXT: ; def s[4:5] 13205; GFX90A-NEXT: ;;#ASMEND 13206; GFX90A-NEXT: ;;#ASMSTART 13207; GFX90A-NEXT: ; def s[6:7] 13208; GFX90A-NEXT: ;;#ASMEND 13209; GFX90A-NEXT: 
s_lshr_b32 s4, s6, 16 13210; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s4 13211; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7 13212; GFX90A-NEXT: ;;#ASMSTART 13213; GFX90A-NEXT: ; use s[8:9] 13214; GFX90A-NEXT: ;;#ASMEND 13215; GFX90A-NEXT: s_setpc_b64 s[30:31] 13216; 13217; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4: 13218; GFX940: ; %bb.0: 13219; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13220; GFX940-NEXT: ;;#ASMSTART 13221; GFX940-NEXT: ; def s[0:1] 13222; GFX940-NEXT: ;;#ASMEND 13223; GFX940-NEXT: ;;#ASMSTART 13224; GFX940-NEXT: ; def s[2:3] 13225; GFX940-NEXT: ;;#ASMEND 13226; GFX940-NEXT: s_lshr_b32 s0, s2, 16 13227; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s0 13228; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3 13229; GFX940-NEXT: ;;#ASMSTART 13230; GFX940-NEXT: ; use s[8:9] 13231; GFX940-NEXT: ;;#ASMEND 13232; GFX940-NEXT: s_setpc_b64 s[30:31] 13233 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13234 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13235 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13236 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13237 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 4> 13238 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13239 ret void 13240} 13241 13242define void @s_shuffle_v4bf16_v3bf16__5_5_3_4() { 13243; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4: 13244; GFX900: ; %bb.0: 13245; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13246; GFX900-NEXT: ;;#ASMSTART 13247; GFX900-NEXT: ; def s[4:5] 13248; GFX900-NEXT: ;;#ASMEND 13249; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5 13250; GFX900-NEXT: s_mov_b32 s9, s4 13251; GFX900-NEXT: ;;#ASMSTART 13252; GFX900-NEXT: ; use s[8:9] 13253; GFX900-NEXT: ;;#ASMEND 13254; GFX900-NEXT: s_setpc_b64 s[30:31] 13255; 13256; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4: 13257; GFX90A: ; 
%bb.0: 13258; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13259; GFX90A-NEXT: ;;#ASMSTART 13260; GFX90A-NEXT: ; def s[4:5] 13261; GFX90A-NEXT: ;;#ASMEND 13262; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5 13263; GFX90A-NEXT: s_mov_b32 s9, s4 13264; GFX90A-NEXT: ;;#ASMSTART 13265; GFX90A-NEXT: ; use s[8:9] 13266; GFX90A-NEXT: ;;#ASMEND 13267; GFX90A-NEXT: s_setpc_b64 s[30:31] 13268; 13269; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4: 13270; GFX940: ; %bb.0: 13271; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13272; GFX940-NEXT: ;;#ASMSTART 13273; GFX940-NEXT: ; def s[0:1] 13274; GFX940-NEXT: ;;#ASMEND 13275; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1 13276; GFX940-NEXT: s_mov_b32 s9, s0 13277; GFX940-NEXT: ;;#ASMSTART 13278; GFX940-NEXT: ; use s[8:9] 13279; GFX940-NEXT: ;;#ASMEND 13280; GFX940-NEXT: s_setpc_b64 s[30:31] 13281 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13282 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13283 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13284 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13285 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 4> 13286 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13287 ret void 13288} 13289 13290define void @s_shuffle_v4bf16_v3bf16__u_5_5_5() { 13291; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5: 13292; GFX900: ; %bb.0: 13293; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13294; GFX900-NEXT: ;;#ASMSTART 13295; GFX900-NEXT: ; def s[4:5] 13296; GFX900-NEXT: ;;#ASMEND 13297; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13298; GFX900-NEXT: s_lshl_b32 s8, s5, 16 13299; GFX900-NEXT: ;;#ASMSTART 13300; GFX900-NEXT: ; use s[8:9] 13301; GFX900-NEXT: ;;#ASMEND 13302; GFX900-NEXT: s_setpc_b64 s[30:31] 13303; 13304; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5: 13305; GFX90A: ; %bb.0: 13306; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13307; GFX90A-NEXT: ;;#ASMSTART 13308; GFX90A-NEXT: ; def s[4:5] 13309; GFX90A-NEXT: ;;#ASMEND 13310; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13311; GFX90A-NEXT: s_lshl_b32 s8, s5, 16 13312; GFX90A-NEXT: ;;#ASMSTART 13313; GFX90A-NEXT: ; use s[8:9] 13314; GFX90A-NEXT: ;;#ASMEND 13315; GFX90A-NEXT: s_setpc_b64 s[30:31] 13316; 13317; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5: 13318; GFX940: ; %bb.0: 13319; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13320; GFX940-NEXT: ;;#ASMSTART 13321; GFX940-NEXT: ; def s[0:1] 13322; GFX940-NEXT: ;;#ASMEND 13323; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 13324; GFX940-NEXT: s_lshl_b32 s8, s1, 16 13325; GFX940-NEXT: ;;#ASMSTART 13326; GFX940-NEXT: ; use s[8:9] 13327; GFX940-NEXT: ;;#ASMEND 13328; GFX940-NEXT: s_setpc_b64 s[30:31] 13329 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13330 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13331 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13332 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13333 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> 13334 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13335 ret void 13336} 13337 13338define void @s_shuffle_v4bf16_v3bf16__0_5_5_5() { 13339; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5: 13340; GFX900: ; %bb.0: 13341; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13342; GFX900-NEXT: ;;#ASMSTART 13343; GFX900-NEXT: ; def s[4:5] 13344; GFX900-NEXT: ;;#ASMEND 13345; GFX900-NEXT: ;;#ASMSTART 13346; GFX900-NEXT: ; def s[6:7] 13347; GFX900-NEXT: ;;#ASMEND 13348; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s7 13349; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13350; GFX900-NEXT: ;;#ASMSTART 13351; GFX900-NEXT: ; use s[8:9] 13352; GFX900-NEXT: ;;#ASMEND 13353; GFX900-NEXT: s_setpc_b64 
s[30:31] 13354; 13355; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5: 13356; GFX90A: ; %bb.0: 13357; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13358; GFX90A-NEXT: ;;#ASMSTART 13359; GFX90A-NEXT: ; def s[4:5] 13360; GFX90A-NEXT: ;;#ASMEND 13361; GFX90A-NEXT: ;;#ASMSTART 13362; GFX90A-NEXT: ; def s[6:7] 13363; GFX90A-NEXT: ;;#ASMEND 13364; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s7 13365; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13366; GFX90A-NEXT: ;;#ASMSTART 13367; GFX90A-NEXT: ; use s[8:9] 13368; GFX90A-NEXT: ;;#ASMEND 13369; GFX90A-NEXT: s_setpc_b64 s[30:31] 13370; 13371; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5: 13372; GFX940: ; %bb.0: 13373; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13374; GFX940-NEXT: ;;#ASMSTART 13375; GFX940-NEXT: ; def s[0:1] 13376; GFX940-NEXT: ;;#ASMEND 13377; GFX940-NEXT: ;;#ASMSTART 13378; GFX940-NEXT: ; def s[2:3] 13379; GFX940-NEXT: ;;#ASMEND 13380; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s3 13381; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s3 13382; GFX940-NEXT: ;;#ASMSTART 13383; GFX940-NEXT: ; use s[8:9] 13384; GFX940-NEXT: ;;#ASMEND 13385; GFX940-NEXT: s_setpc_b64 s[30:31] 13386 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13387 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13388 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13389 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13390 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 5, i32 5, i32 5> 13391 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13392 ret void 13393} 13394 13395define void @s_shuffle_v4bf16_v3bf16__1_5_5_5() { 13396; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5: 13397; GFX900: ; %bb.0: 13398; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13399; GFX900-NEXT: ;;#ASMSTART 13400; GFX900-NEXT: ; def s[4:5] 13401; GFX900-NEXT: ;;#ASMEND 13402; GFX900-NEXT: 
s_lshr_b32 s4, s4, 16 13403; GFX900-NEXT: ;;#ASMSTART 13404; GFX900-NEXT: ; def s[6:7] 13405; GFX900-NEXT: ;;#ASMEND 13406; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s7 13407; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13408; GFX900-NEXT: ;;#ASMSTART 13409; GFX900-NEXT: ; use s[8:9] 13410; GFX900-NEXT: ;;#ASMEND 13411; GFX900-NEXT: s_setpc_b64 s[30:31] 13412; 13413; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5: 13414; GFX90A: ; %bb.0: 13415; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13416; GFX90A-NEXT: ;;#ASMSTART 13417; GFX90A-NEXT: ; def s[4:5] 13418; GFX90A-NEXT: ;;#ASMEND 13419; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 13420; GFX90A-NEXT: ;;#ASMSTART 13421; GFX90A-NEXT: ; def s[6:7] 13422; GFX90A-NEXT: ;;#ASMEND 13423; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s7 13424; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13425; GFX90A-NEXT: ;;#ASMSTART 13426; GFX90A-NEXT: ; use s[8:9] 13427; GFX90A-NEXT: ;;#ASMEND 13428; GFX90A-NEXT: s_setpc_b64 s[30:31] 13429; 13430; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5: 13431; GFX940: ; %bb.0: 13432; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13433; GFX940-NEXT: ;;#ASMSTART 13434; GFX940-NEXT: ; def s[0:1] 13435; GFX940-NEXT: ;;#ASMEND 13436; GFX940-NEXT: s_lshr_b32 s0, s0, 16 13437; GFX940-NEXT: ;;#ASMSTART 13438; GFX940-NEXT: ; def s[2:3] 13439; GFX940-NEXT: ;;#ASMEND 13440; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s3 13441; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s3 13442; GFX940-NEXT: ;;#ASMSTART 13443; GFX940-NEXT: ; use s[8:9] 13444; GFX940-NEXT: ;;#ASMEND 13445; GFX940-NEXT: s_setpc_b64 s[30:31] 13446 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13447 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13448 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13449 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13450 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 5, 
i32 5, i32 5> 13451 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13452 ret void 13453} 13454 13455define void @s_shuffle_v4bf16_v3bf16__2_5_5_5() { 13456; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5: 13457; GFX900: ; %bb.0: 13458; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13459; GFX900-NEXT: ;;#ASMSTART 13460; GFX900-NEXT: ; def s[4:5] 13461; GFX900-NEXT: ;;#ASMEND 13462; GFX900-NEXT: ;;#ASMSTART 13463; GFX900-NEXT: ; def s[6:7] 13464; GFX900-NEXT: ;;#ASMEND 13465; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s7 13466; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13467; GFX900-NEXT: ;;#ASMSTART 13468; GFX900-NEXT: ; use s[8:9] 13469; GFX900-NEXT: ;;#ASMEND 13470; GFX900-NEXT: s_setpc_b64 s[30:31] 13471; 13472; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5: 13473; GFX90A: ; %bb.0: 13474; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13475; GFX90A-NEXT: ;;#ASMSTART 13476; GFX90A-NEXT: ; def s[4:5] 13477; GFX90A-NEXT: ;;#ASMEND 13478; GFX90A-NEXT: ;;#ASMSTART 13479; GFX90A-NEXT: ; def s[6:7] 13480; GFX90A-NEXT: ;;#ASMEND 13481; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s7 13482; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13483; GFX90A-NEXT: ;;#ASMSTART 13484; GFX90A-NEXT: ; use s[8:9] 13485; GFX90A-NEXT: ;;#ASMEND 13486; GFX90A-NEXT: s_setpc_b64 s[30:31] 13487; 13488; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5: 13489; GFX940: ; %bb.0: 13490; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13491; GFX940-NEXT: ;;#ASMSTART 13492; GFX940-NEXT: ; def s[0:1] 13493; GFX940-NEXT: ;;#ASMEND 13494; GFX940-NEXT: ;;#ASMSTART 13495; GFX940-NEXT: ; def s[2:3] 13496; GFX940-NEXT: ;;#ASMEND 13497; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s3 13498; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s3 13499; GFX940-NEXT: ;;#ASMSTART 13500; GFX940-NEXT: ; use s[8:9] 13501; GFX940-NEXT: ;;#ASMEND 13502; GFX940-NEXT: s_setpc_b64 s[30:31] 13503 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13504 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13505 %extract3 
= shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13506 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13507 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 5, i32 5, i32 5> 13508 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13509 ret void 13510} 13511 13512define void @s_shuffle_v4bf16_v3bf16__3_5_5_5() { 13513; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5: 13514; GFX900: ; %bb.0: 13515; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13516; GFX900-NEXT: ;;#ASMSTART 13517; GFX900-NEXT: ; def s[4:5] 13518; GFX900-NEXT: ;;#ASMEND 13519; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 13520; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13521; GFX900-NEXT: ;;#ASMSTART 13522; GFX900-NEXT: ; use s[8:9] 13523; GFX900-NEXT: ;;#ASMEND 13524; GFX900-NEXT: s_setpc_b64 s[30:31] 13525; 13526; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5: 13527; GFX90A: ; %bb.0: 13528; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13529; GFX90A-NEXT: ;;#ASMSTART 13530; GFX90A-NEXT: ; def s[4:5] 13531; GFX90A-NEXT: ;;#ASMEND 13532; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 13533; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13534; GFX90A-NEXT: ;;#ASMSTART 13535; GFX90A-NEXT: ; use s[8:9] 13536; GFX90A-NEXT: ;;#ASMEND 13537; GFX90A-NEXT: s_setpc_b64 s[30:31] 13538; 13539; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5: 13540; GFX940: ; %bb.0: 13541; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13542; GFX940-NEXT: ;;#ASMSTART 13543; GFX940-NEXT: ; def s[0:1] 13544; GFX940-NEXT: ;;#ASMEND 13545; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 13546; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 13547; GFX940-NEXT: ;;#ASMSTART 13548; GFX940-NEXT: ; use s[8:9] 13549; GFX940-NEXT: ;;#ASMEND 13550; GFX940-NEXT: s_setpc_b64 s[30:31] 13551 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13552 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 
13553 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13554 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13555 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 5, i32 5, i32 5> 13556 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13557 ret void 13558} 13559 13560define void @s_shuffle_v4bf16_v3bf16__4_5_5_5() { 13561; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5: 13562; GFX900: ; %bb.0: 13563; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13564; GFX900-NEXT: ;;#ASMSTART 13565; GFX900-NEXT: ; def s[4:5] 13566; GFX900-NEXT: ;;#ASMEND 13567; GFX900-NEXT: s_lshr_b32 s4, s4, 16 13568; GFX900-NEXT: s_pack_ll_b32_b16 s8, s4, s5 13569; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13570; GFX900-NEXT: ;;#ASMSTART 13571; GFX900-NEXT: ; use s[8:9] 13572; GFX900-NEXT: ;;#ASMEND 13573; GFX900-NEXT: s_setpc_b64 s[30:31] 13574; 13575; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5: 13576; GFX90A: ; %bb.0: 13577; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13578; GFX90A-NEXT: ;;#ASMSTART 13579; GFX90A-NEXT: ; def s[4:5] 13580; GFX90A-NEXT: ;;#ASMEND 13581; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 13582; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s4, s5 13583; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13584; GFX90A-NEXT: ;;#ASMSTART 13585; GFX90A-NEXT: ; use s[8:9] 13586; GFX90A-NEXT: ;;#ASMEND 13587; GFX90A-NEXT: s_setpc_b64 s[30:31] 13588; 13589; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5: 13590; GFX940: ; %bb.0: 13591; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13592; GFX940-NEXT: ;;#ASMSTART 13593; GFX940-NEXT: ; def s[0:1] 13594; GFX940-NEXT: ;;#ASMEND 13595; GFX940-NEXT: s_lshr_b32 s0, s0, 16 13596; GFX940-NEXT: s_pack_ll_b32_b16 s8, s0, s1 13597; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 13598; GFX940-NEXT: ;;#ASMSTART 13599; GFX940-NEXT: ; use s[8:9] 13600; GFX940-NEXT: ;;#ASMEND 13601; 
GFX940-NEXT: s_setpc_b64 s[30:31] 13602 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13603 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13604 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13605 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13606 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 5, i32 5, i32 5> 13607 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13608 ret void 13609} 13610 13611define void @s_shuffle_v4bf16_v3bf16__5_u_5_5() { 13612; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5: 13613; GFX900: ; %bb.0: 13614; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13615; GFX900-NEXT: ;;#ASMSTART 13616; GFX900-NEXT: ; def s[4:5] 13617; GFX900-NEXT: ;;#ASMEND 13618; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13619; GFX900-NEXT: s_mov_b32 s8, s5 13620; GFX900-NEXT: ;;#ASMSTART 13621; GFX900-NEXT: ; use s[8:9] 13622; GFX900-NEXT: ;;#ASMEND 13623; GFX900-NEXT: s_setpc_b64 s[30:31] 13624; 13625; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5: 13626; GFX90A: ; %bb.0: 13627; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13628; GFX90A-NEXT: ;;#ASMSTART 13629; GFX90A-NEXT: ; def s[4:5] 13630; GFX90A-NEXT: ;;#ASMEND 13631; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13632; GFX90A-NEXT: s_mov_b32 s8, s5 13633; GFX90A-NEXT: ;;#ASMSTART 13634; GFX90A-NEXT: ; use s[8:9] 13635; GFX90A-NEXT: ;;#ASMEND 13636; GFX90A-NEXT: s_setpc_b64 s[30:31] 13637; 13638; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5: 13639; GFX940: ; %bb.0: 13640; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13641; GFX940-NEXT: ;;#ASMSTART 13642; GFX940-NEXT: ; def s[0:1] 13643; GFX940-NEXT: ;;#ASMEND 13644; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 13645; GFX940-NEXT: s_mov_b32 s8, s1 13646; GFX940-NEXT: ;;#ASMSTART 13647; GFX940-NEXT: ; use s[8:9] 13648; GFX940-NEXT: ;;#ASMEND 13649; GFX940-NEXT: 
s_setpc_b64 s[30:31] 13650 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13651 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13652 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13653 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13654 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 5, i32 5> 13655 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13656 ret void 13657} 13658 13659define void @s_shuffle_v4bf16_v3bf16__5_0_5_5() { 13660; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5: 13661; GFX900: ; %bb.0: 13662; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13663; GFX900-NEXT: ;;#ASMSTART 13664; GFX900-NEXT: ; def s[4:5] 13665; GFX900-NEXT: ;;#ASMEND 13666; GFX900-NEXT: ;;#ASMSTART 13667; GFX900-NEXT: ; def s[6:7] 13668; GFX900-NEXT: ;;#ASMEND 13669; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 13670; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13671; GFX900-NEXT: ;;#ASMSTART 13672; GFX900-NEXT: ; use s[8:9] 13673; GFX900-NEXT: ;;#ASMEND 13674; GFX900-NEXT: s_setpc_b64 s[30:31] 13675; 13676; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5: 13677; GFX90A: ; %bb.0: 13678; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13679; GFX90A-NEXT: ;;#ASMSTART 13680; GFX90A-NEXT: ; def s[4:5] 13681; GFX90A-NEXT: ;;#ASMEND 13682; GFX90A-NEXT: ;;#ASMSTART 13683; GFX90A-NEXT: ; def s[6:7] 13684; GFX90A-NEXT: ;;#ASMEND 13685; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 13686; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13687; GFX90A-NEXT: ;;#ASMSTART 13688; GFX90A-NEXT: ; use s[8:9] 13689; GFX90A-NEXT: ;;#ASMEND 13690; GFX90A-NEXT: s_setpc_b64 s[30:31] 13691; 13692; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5: 13693; GFX940: ; %bb.0: 13694; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13695; GFX940-NEXT: ;;#ASMSTART 13696; GFX940-NEXT: ; def s[0:1] 13697; GFX940-NEXT: ;;#ASMEND 
13698; GFX940-NEXT: ;;#ASMSTART 13699; GFX940-NEXT: ; def s[2:3] 13700; GFX940-NEXT: ;;#ASMEND 13701; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 13702; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s3 13703; GFX940-NEXT: ;;#ASMSTART 13704; GFX940-NEXT: ; use s[8:9] 13705; GFX940-NEXT: ;;#ASMEND 13706; GFX940-NEXT: s_setpc_b64 s[30:31] 13707 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13708 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13709 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13710 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13711 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 5, i32 5> 13712 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13713 ret void 13714} 13715 13716define void @s_shuffle_v4bf16_v3bf16__5_1_5_5() { 13717; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5: 13718; GFX900: ; %bb.0: 13719; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13720; GFX900-NEXT: ;;#ASMSTART 13721; GFX900-NEXT: ; def s[4:5] 13722; GFX900-NEXT: ;;#ASMEND 13723; GFX900-NEXT: s_lshr_b32 s4, s4, 16 13724; GFX900-NEXT: ;;#ASMSTART 13725; GFX900-NEXT: ; def s[6:7] 13726; GFX900-NEXT: ;;#ASMEND 13727; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s4 13728; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13729; GFX900-NEXT: ;;#ASMSTART 13730; GFX900-NEXT: ; use s[8:9] 13731; GFX900-NEXT: ;;#ASMEND 13732; GFX900-NEXT: s_setpc_b64 s[30:31] 13733; 13734; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5: 13735; GFX90A: ; %bb.0: 13736; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13737; GFX90A-NEXT: ;;#ASMSTART 13738; GFX90A-NEXT: ; def s[4:5] 13739; GFX90A-NEXT: ;;#ASMEND 13740; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 13741; GFX90A-NEXT: ;;#ASMSTART 13742; GFX90A-NEXT: ; def s[6:7] 13743; GFX90A-NEXT: ;;#ASMEND 13744; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s4 13745; GFX90A-NEXT: s_pack_ll_b32_b16 s9, 
s7, s7 13746; GFX90A-NEXT: ;;#ASMSTART 13747; GFX90A-NEXT: ; use s[8:9] 13748; GFX90A-NEXT: ;;#ASMEND 13749; GFX90A-NEXT: s_setpc_b64 s[30:31] 13750; 13751; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5: 13752; GFX940: ; %bb.0: 13753; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13754; GFX940-NEXT: ;;#ASMSTART 13755; GFX940-NEXT: ; def s[0:1] 13756; GFX940-NEXT: ;;#ASMEND 13757; GFX940-NEXT: s_lshr_b32 s0, s0, 16 13758; GFX940-NEXT: ;;#ASMSTART 13759; GFX940-NEXT: ; def s[2:3] 13760; GFX940-NEXT: ;;#ASMEND 13761; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s0 13762; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s3 13763; GFX940-NEXT: ;;#ASMSTART 13764; GFX940-NEXT: ; use s[8:9] 13765; GFX940-NEXT: ;;#ASMEND 13766; GFX940-NEXT: s_setpc_b64 s[30:31] 13767 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13768 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13769 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13770 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13771 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 5, i32 5> 13772 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13773 ret void 13774} 13775 13776define void @s_shuffle_v4bf16_v3bf16__5_2_5_5() { 13777; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5: 13778; GFX900: ; %bb.0: 13779; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13780; GFX900-NEXT: ;;#ASMSTART 13781; GFX900-NEXT: ; def s[4:5] 13782; GFX900-NEXT: ;;#ASMEND 13783; GFX900-NEXT: ;;#ASMSTART 13784; GFX900-NEXT: ; def s[6:7] 13785; GFX900-NEXT: ;;#ASMEND 13786; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s5 13787; GFX900-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13788; GFX900-NEXT: ;;#ASMSTART 13789; GFX900-NEXT: ; use s[8:9] 13790; GFX900-NEXT: ;;#ASMEND 13791; GFX900-NEXT: s_setpc_b64 s[30:31] 13792; 13793; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5: 13794; GFX90A: ; %bb.0: 
13795; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13796; GFX90A-NEXT: ;;#ASMSTART 13797; GFX90A-NEXT: ; def s[4:5] 13798; GFX90A-NEXT: ;;#ASMEND 13799; GFX90A-NEXT: ;;#ASMSTART 13800; GFX90A-NEXT: ; def s[6:7] 13801; GFX90A-NEXT: ;;#ASMEND 13802; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s5 13803; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s7, s7 13804; GFX90A-NEXT: ;;#ASMSTART 13805; GFX90A-NEXT: ; use s[8:9] 13806; GFX90A-NEXT: ;;#ASMEND 13807; GFX90A-NEXT: s_setpc_b64 s[30:31] 13808; 13809; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5: 13810; GFX940: ; %bb.0: 13811; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13812; GFX940-NEXT: ;;#ASMSTART 13813; GFX940-NEXT: ; def s[0:1] 13814; GFX940-NEXT: ;;#ASMEND 13815; GFX940-NEXT: ;;#ASMSTART 13816; GFX940-NEXT: ; def s[2:3] 13817; GFX940-NEXT: ;;#ASMEND 13818; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s1 13819; GFX940-NEXT: s_pack_ll_b32_b16 s9, s3, s3 13820; GFX940-NEXT: ;;#ASMSTART 13821; GFX940-NEXT: ; use s[8:9] 13822; GFX940-NEXT: ;;#ASMEND 13823; GFX940-NEXT: s_setpc_b64 s[30:31] 13824 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13825 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13826 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13827 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13828 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 5, i32 5> 13829 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13830 ret void 13831} 13832 13833define void @s_shuffle_v4bf16_v3bf16__5_3_5_5() { 13834; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5: 13835; GFX900: ; %bb.0: 13836; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13837; GFX900-NEXT: ;;#ASMSTART 13838; GFX900-NEXT: ; def s[4:5] 13839; GFX900-NEXT: ;;#ASMEND 13840; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 13841; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13842; GFX900-NEXT: 
;;#ASMSTART 13843; GFX900-NEXT: ; use s[8:9] 13844; GFX900-NEXT: ;;#ASMEND 13845; GFX900-NEXT: s_setpc_b64 s[30:31] 13846; 13847; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5: 13848; GFX90A: ; %bb.0: 13849; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13850; GFX90A-NEXT: ;;#ASMSTART 13851; GFX90A-NEXT: ; def s[4:5] 13852; GFX90A-NEXT: ;;#ASMEND 13853; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4 13854; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5 13855; GFX90A-NEXT: ;;#ASMSTART 13856; GFX90A-NEXT: ; use s[8:9] 13857; GFX90A-NEXT: ;;#ASMEND 13858; GFX90A-NEXT: s_setpc_b64 s[30:31] 13859; 13860; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5: 13861; GFX940: ; %bb.0: 13862; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13863; GFX940-NEXT: ;;#ASMSTART 13864; GFX940-NEXT: ; def s[0:1] 13865; GFX940-NEXT: ;;#ASMEND 13866; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0 13867; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1 13868; GFX940-NEXT: ;;#ASMSTART 13869; GFX940-NEXT: ; use s[8:9] 13870; GFX940-NEXT: ;;#ASMEND 13871; GFX940-NEXT: s_setpc_b64 s[30:31] 13872 %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 13873 %vec1 = call <4 x bfloat> asm "; def $0", "=s"() 13874 %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13875 %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> 13876 %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 5, i32 5> 13877 call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf) 13878 ret void 13879} 13880 13881define void @s_shuffle_v4bf16_v3bf16__5_4_5_5() { 13882; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5: 13883; GFX900: ; %bb.0: 13884; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13885; GFX900-NEXT: ;;#ASMSTART 13886; GFX900-NEXT: ; def s[4:5] 13887; GFX900-NEXT: ;;#ASMEND 13888; GFX900-NEXT: s_lshr_b32 s4, s4, 16 13889; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s4 13890; 
GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s0
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s1
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}

; Mask <5, 5, poison, 5>: result elements 0, 1 and 3 all take index 5, i.e.
; element 2 of %extract31 (and therefore of %vec1); element 2 is poison.
; %vec0 is not referenced by the mask, and the expected output below contains
; a single "def" asm block. The shuffled vector is handed to a side-effecting
; asm through the "{s[8:9]}" constraint.
define void @s_shuffle_v4bf16_v3bf16__5_5_u_5() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshl_b32 s9, s5, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshl_b32 s9, s5, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshl_b32 s9, s1, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}

; Mask <5, 5, 0, 5>: result elements 0, 1 and 3 take index 5 (element 2 of
; %vec1 via %extract31); element 2 takes index 0 (element 0 of %vec0 via
; %extract3). Both asm-defined sources are referenced, and the expected output
; below contains two "def" asm blocks.
define void @s_shuffle_v4bf16_v3bf16__5_5_0_5() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s7
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s7
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s3
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}

; Mask <5, 5, 1, 5>: result elements 0, 1 and 3 take index 5 (element 2 of
; %vec1); element 2 takes index 1 (element 1 of %vec0). The expected output
; below shifts the low register of the first "def" right by 16 before the
; pack instructions.
define void @s_shuffle_v4bf16_v3bf16__5_5_1_5() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s7
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s7
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s3
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}

; Mask <5, 5, 2, 5>: result elements 0, 1 and 3 take index 5 (element 2 of
; %vec1); element 2 takes index 2 (element 2 of %vec0). Both asm-defined
; sources are referenced, and the expected output below contains two "def"
; asm blocks.
define void @s_shuffle_v4bf16_v3bf16__5_5_2_5() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[6:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s5, s7
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[6:7]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s5, s7
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s1, s3
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s3, s3
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}

; Mask <5, 5, 3, 5>: result elements 0, 1 and 3 take index 5 (element 2 of
; %vec1); element 2 takes index 3 (element 0 of %vec1 via %extract31).
; %vec0 is not referenced by the mask, and the expected output below contains
; a single "def" asm block.
define void @s_shuffle_v4bf16_v3bf16__5_5_3_5() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}

; Mask <5, 5, 4, 5>: result elements 0, 1 and 3 take index 5 (element 2 of
; %vec1); element 2 takes index 4 (element 1 of %vec1 via %extract31).
; %vec0 is not referenced by the mask; the expected output below contains a
; single "def" asm block and shifts its low register right by 16 before the
; pack instructions.
define void @s_shuffle_v4bf16_v3bf16__5_5_4_5() {
; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; def s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_lshr_b32 s4, s4, 16
; GFX900-NEXT: s_pack_ll_b32_b16 s9, s4, s5
; GFX900-NEXT: s_pack_ll_b32_b16 s8, s5, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[8:9]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def s[4:5]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_lshr_b32 s4, s4, 16
; GFX90A-NEXT: s_pack_ll_b32_b16 s9, s4, s5
; GFX90A-NEXT: s_pack_ll_b32_b16 s8, s5, s5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use s[8:9]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def s[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_lshr_b32 s0, s0, 16
; GFX940-NEXT: s_pack_ll_b32_b16 s9, s0, s1
; GFX940-NEXT: s_pack_ll_b32_b16 s8, s1, s1
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; use s[8:9]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_setpc_b64 s[30:31]
  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 5>
  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX90APLUS: {{.*}}