1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GX900 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX940 %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s 5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s 6 7define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 8; GFX9-LABEL: shuffle_v4f16_23uu: 9; GFX9: ; %bb.0: 10; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 12; GFX9-NEXT: s_waitcnt vmcnt(0) 13; GFX9-NEXT: s_setpc_b64 s[30:31] 14; 15; GFX10-LABEL: shuffle_v4f16_23uu: 16; GFX10: ; %bb.0: 17; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 19; GFX10-NEXT: s_waitcnt vmcnt(0) 20; GFX10-NEXT: s_setpc_b64 s[30:31] 21; 22; GFX11-LABEL: shuffle_v4f16_23uu: 23; GFX11: ; %bb.0: 24; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 25; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 26; GFX11-NEXT: s_waitcnt vmcnt(0) 27; GFX11-NEXT: s_setpc_b64 s[30:31] 28 %val0 = load <4 x half>, ptr addrspace(1) %arg0 29 %val1 = load <4 x half>, ptr addrspace(1) %arg1 30 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 31 ret <4 x half> %shuffle 32} 33 34define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 35; GX900-LABEL: shuffle_v4f16_234u: 36; GX900: ; %bb.0: 37; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 38; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 39; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 40; GX900-NEXT: s_waitcnt vmcnt(1) 41; GX900-NEXT: 
v_mov_b32_e32 v0, v6 42; GX900-NEXT: s_waitcnt vmcnt(0) 43; GX900-NEXT: v_mov_b32_e32 v1, v4 44; GX900-NEXT: s_setpc_b64 s[30:31] 45; 46; GFX940-LABEL: shuffle_v4f16_234u: 47; GFX940: ; %bb.0: 48; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 49; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4 50; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off 51; GFX940-NEXT: s_waitcnt vmcnt(1) 52; GFX940-NEXT: v_mov_b32_e32 v0, v4 53; GFX940-NEXT: s_waitcnt vmcnt(0) 54; GFX940-NEXT: v_mov_b32_e32 v1, v6 55; GFX940-NEXT: s_setpc_b64 s[30:31] 56; 57; GFX10-LABEL: shuffle_v4f16_234u: 58; GFX10: ; %bb.0: 59; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 60; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 61; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 62; GFX10-NEXT: s_waitcnt vmcnt(1) 63; GFX10-NEXT: v_mov_b32_e32 v0, v6 64; GFX10-NEXT: s_waitcnt vmcnt(0) 65; GFX10-NEXT: v_mov_b32_e32 v1, v4 66; GFX10-NEXT: s_setpc_b64 s[30:31] 67; 68; GFX11-LABEL: shuffle_v4f16_234u: 69; GFX11: ; %bb.0: 70; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 71; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 72; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off 73; GFX11-NEXT: s_waitcnt vmcnt(0) 74; GFX11-NEXT: s_setpc_b64 s[30:31] 75 %val0 = load <4 x half>, ptr addrspace(1) %arg0 76 %val1 = load <4 x half>, ptr addrspace(1) %arg1 77 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef> 78 ret <4 x half> %shuffle 79} 80 81define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 82; GFX9-LABEL: shuffle_v4f16_u1u3: 83; GFX9: ; %bb.0: 84; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 85; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 86; GFX9-NEXT: s_waitcnt vmcnt(0) 87; GFX9-NEXT: s_setpc_b64 s[30:31] 88; 89; GFX10-LABEL: shuffle_v4f16_u1u3: 90; GFX10: ; %bb.0: 91; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 92; GFX10-NEXT: global_load_dwordx2 v[0:1], 
v[0:1], off 93; GFX10-NEXT: s_waitcnt vmcnt(0) 94; GFX10-NEXT: s_setpc_b64 s[30:31] 95; 96; GFX11-LABEL: shuffle_v4f16_u1u3: 97; GFX11: ; %bb.0: 98; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 99; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 100; GFX11-NEXT: s_waitcnt vmcnt(0) 101; GFX11-NEXT: s_setpc_b64 s[30:31] 102 %val0 = load <4 x half>, ptr addrspace(1) %arg0 103 %val1 = load <4 x half>, ptr addrspace(1) %arg1 104 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3> 105 ret <4 x half> %shuffle 106} 107 108define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 109; GX900-LABEL: shuffle_v4f16_u3u1: 110; GX900: ; %bb.0: 111; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 112; GX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 113; GX900-NEXT: s_waitcnt vmcnt(0) 114; GX900-NEXT: v_mov_b32_e32 v0, v2 115; GX900-NEXT: s_setpc_b64 s[30:31] 116; 117; GFX940-LABEL: shuffle_v4f16_u3u1: 118; GFX940: ; %bb.0: 119; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 120; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off 121; GFX940-NEXT: s_waitcnt vmcnt(0) 122; GFX940-NEXT: v_mov_b32_e32 v0, v3 123; GFX940-NEXT: v_mov_b32_e32 v1, v2 124; GFX940-NEXT: s_setpc_b64 s[30:31] 125; 126; GFX10-LABEL: shuffle_v4f16_u3u1: 127; GFX10: ; %bb.0: 128; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 129; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 130; GFX10-NEXT: s_waitcnt vmcnt(0) 131; GFX10-NEXT: v_mov_b32_e32 v0, v2 132; GFX10-NEXT: s_setpc_b64 s[30:31] 133; 134; GFX11-LABEL: shuffle_v4f16_u3u1: 135; GFX11: ; %bb.0: 136; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 137; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off 138; GFX11-NEXT: s_waitcnt vmcnt(0) 139; GFX11-NEXT: v_mov_b32_e32 v0, v2 140; GFX11-NEXT: s_setpc_b64 s[30:31] 141 %val0 = load <4 x half>, ptr addrspace(1) %arg0 142 %val1 = load <4 x half>, ptr addrspace(1) %arg1 143 %shuffle = 
shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1> 144 ret <4 x half> %shuffle 145} 146 147define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 148; GFX9-LABEL: shuffle_v4f16_u3uu: 149; GFX9: ; %bb.0: 150; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 151; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 152; GFX9-NEXT: s_waitcnt vmcnt(0) 153; GFX9-NEXT: s_setpc_b64 s[30:31] 154; 155; GFX10-LABEL: shuffle_v4f16_u3uu: 156; GFX10: ; %bb.0: 157; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 158; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 159; GFX10-NEXT: s_waitcnt vmcnt(0) 160; GFX10-NEXT: s_setpc_b64 s[30:31] 161; 162; GFX11-LABEL: shuffle_v4f16_u3uu: 163; GFX11: ; %bb.0: 164; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 165; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 166; GFX11-NEXT: s_waitcnt vmcnt(0) 167; GFX11-NEXT: s_setpc_b64 s[30:31] 168 %val0 = load <4 x half>, ptr addrspace(1) %arg0 169 %val1 = load <4 x half>, ptr addrspace(1) %arg1 170 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef> 171 ret <4 x half> %shuffle 172} 173 174define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 175; GX900-LABEL: shuffle_v4f16_3u6u: 176; GX900: ; %bb.0: 177; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 178; GX900-NEXT: global_load_dword v5, v[0:1], off offset:4 179; GX900-NEXT: global_load_dword v4, v[2:3], off offset:4 180; GX900-NEXT: s_waitcnt vmcnt(1) 181; GX900-NEXT: v_alignbit_b32 v0, s4, v5, 16 182; GX900-NEXT: s_waitcnt vmcnt(0) 183; GX900-NEXT: v_mov_b32_e32 v1, v4 184; GX900-NEXT: s_setpc_b64 s[30:31] 185; 186; GFX940-LABEL: shuffle_v4f16_3u6u: 187; GFX940: ; %bb.0: 188; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 189; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:4 190; GFX940-NEXT: global_load_dword v4, v[2:3], off 
offset:4 191; GFX940-NEXT: s_waitcnt vmcnt(1) 192; GFX940-NEXT: v_alignbit_b32 v0, s0, v5, 16 193; GFX940-NEXT: s_waitcnt vmcnt(0) 194; GFX940-NEXT: v_mov_b32_e32 v1, v4 195; GFX940-NEXT: s_setpc_b64 s[30:31] 196; 197; GFX10-LABEL: shuffle_v4f16_3u6u: 198; GFX10: ; %bb.0: 199; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 200; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 201; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 202; GFX10-NEXT: s_waitcnt vmcnt(1) 203; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16 204; GFX10-NEXT: s_waitcnt vmcnt(0) 205; GFX10-NEXT: v_mov_b32_e32 v1, v4 206; GFX10-NEXT: s_setpc_b64 s[30:31] 207; 208; GFX11-LABEL: shuffle_v4f16_3u6u: 209; GFX11: ; %bb.0: 210; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 211; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 212; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 213; GFX11-NEXT: s_waitcnt vmcnt(1) 214; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16 215; GFX11-NEXT: s_waitcnt vmcnt(0) 216; GFX11-NEXT: s_setpc_b64 s[30:31] 217 %val0 = load <4 x half>, ptr addrspace(1) %arg0 218 %val1 = load <4 x half>, ptr addrspace(1) %arg1 219 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef> 220 ret <4 x half> %shuffle 221} 222 223define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 224; GX900-LABEL: shuffle_v4f16_3uu7: 225; GX900: ; %bb.0: 226; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 227; GX900-NEXT: global_load_dword v5, v[0:1], off offset:4 228; GX900-NEXT: global_load_dword v4, v[2:3], off offset:4 229; GX900-NEXT: s_waitcnt vmcnt(1) 230; GX900-NEXT: v_alignbit_b32 v0, s4, v5, 16 231; GX900-NEXT: s_waitcnt vmcnt(0) 232; GX900-NEXT: v_mov_b32_e32 v1, v4 233; GX900-NEXT: s_setpc_b64 s[30:31] 234; 235; GFX940-LABEL: shuffle_v4f16_3uu7: 236; GFX940: ; %bb.0: 237; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 238; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:4 
239; GFX940-NEXT: global_load_dword v4, v[2:3], off offset:4 240; GFX940-NEXT: s_waitcnt vmcnt(1) 241; GFX940-NEXT: v_alignbit_b32 v0, s0, v5, 16 242; GFX940-NEXT: s_waitcnt vmcnt(0) 243; GFX940-NEXT: v_mov_b32_e32 v1, v4 244; GFX940-NEXT: s_setpc_b64 s[30:31] 245; 246; GFX10-LABEL: shuffle_v4f16_3uu7: 247; GFX10: ; %bb.0: 248; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 249; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 250; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 251; GFX10-NEXT: s_waitcnt vmcnt(1) 252; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16 253; GFX10-NEXT: s_waitcnt vmcnt(0) 254; GFX10-NEXT: v_mov_b32_e32 v1, v4 255; GFX10-NEXT: s_setpc_b64 s[30:31] 256; 257; GFX11-LABEL: shuffle_v4f16_3uu7: 258; GFX11: ; %bb.0: 259; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 260; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 261; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 262; GFX11-NEXT: s_waitcnt vmcnt(1) 263; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16 264; GFX11-NEXT: s_waitcnt vmcnt(0) 265; GFX11-NEXT: s_setpc_b64 s[30:31] 266 %val0 = load <4 x half>, ptr addrspace(1) %arg0 267 %val1 = load <4 x half>, ptr addrspace(1) %arg1 268 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7> 269 ret <4 x half> %shuffle 270} 271 272define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 273; GX900-LABEL: shuffle_v4f16_35u5: 274; GX900: ; %bb.0: 275; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 276; GX900-NEXT: global_load_dword v5, v[0:1], off offset:4 277; GX900-NEXT: global_load_dword v4, v[2:3], off 278; GX900-NEXT: s_mov_b32 s4, 0x7060302 279; GX900-NEXT: s_waitcnt vmcnt(0) 280; GX900-NEXT: v_perm_b32 v0, v4, v5, s4 281; GX900-NEXT: v_mov_b32_e32 v1, v4 282; GX900-NEXT: s_setpc_b64 s[30:31] 283; 284; GFX940-LABEL: shuffle_v4f16_35u5: 285; GFX940: ; %bb.0: 286; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 287; GFX940-NEXT: 
global_load_dword v5, v[0:1], off offset:4 288; GFX940-NEXT: global_load_dword v4, v[2:3], off 289; GFX940-NEXT: s_mov_b32 s0, 0x7060302 290; GFX940-NEXT: s_waitcnt vmcnt(0) 291; GFX940-NEXT: v_perm_b32 v0, v4, v5, s0 292; GFX940-NEXT: v_mov_b32_e32 v1, v4 293; GFX940-NEXT: s_setpc_b64 s[30:31] 294; 295; GFX10-LABEL: shuffle_v4f16_35u5: 296; GFX10: ; %bb.0: 297; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 298; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 299; GFX10-NEXT: global_load_dword v4, v[2:3], off 300; GFX10-NEXT: s_waitcnt vmcnt(0) 301; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060302 302; GFX10-NEXT: v_mov_b32_e32 v1, v4 303; GFX10-NEXT: s_setpc_b64 s[30:31] 304; 305; GFX11-LABEL: shuffle_v4f16_35u5: 306; GFX11: ; %bb.0: 307; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 308; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 309; GFX11-NEXT: global_load_b32 v1, v[2:3], off 310; GFX11-NEXT: s_waitcnt vmcnt(0) 311; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 312; GFX11-NEXT: s_setpc_b64 s[30:31] 313 %val0 = load <4 x half>, ptr addrspace(1) %arg0 314 %val1 = load <4 x half>, ptr addrspace(1) %arg1 315 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5> 316 ret <4 x half> %shuffle 317} 318 319define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 320; GX900-LABEL: shuffle_v4f16_357u: 321; GX900: ; %bb.0: 322; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 323; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 324; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 325; GX900-NEXT: s_mov_b32 s4, 0x7060302 326; GX900-NEXT: s_waitcnt vmcnt(1) 327; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16 328; GX900-NEXT: s_waitcnt vmcnt(0) 329; GX900-NEXT: v_perm_b32 v0, v4, v6, s4 330; GX900-NEXT: s_setpc_b64 s[30:31] 331; 332; GFX940-LABEL: shuffle_v4f16_357u: 333; GFX940: ; %bb.0: 334; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 335; GFX940-NEXT: 
global_load_dwordx2 v[4:5], v[2:3], off 336; GFX940-NEXT: global_load_dword v6, v[0:1], off offset:4 337; GFX940-NEXT: s_mov_b32 s0, 0x7060302 338; GFX940-NEXT: s_waitcnt vmcnt(1) 339; GFX940-NEXT: v_alignbit_b32 v1, s0, v5, 16 340; GFX940-NEXT: s_waitcnt vmcnt(0) 341; GFX940-NEXT: v_perm_b32 v0, v4, v6, s0 342; GFX940-NEXT: s_setpc_b64 s[30:31] 343; 344; GFX10-LABEL: shuffle_v4f16_357u: 345; GFX10: ; %bb.0: 346; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 347; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 348; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 349; GFX10-NEXT: s_waitcnt vmcnt(1) 350; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16 351; GFX10-NEXT: s_waitcnt vmcnt(0) 352; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 353; GFX10-NEXT: s_setpc_b64 s[30:31] 354; 355; GFX11-LABEL: shuffle_v4f16_357u: 356; GFX11: ; %bb.0: 357; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 358; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 359; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 360; GFX11-NEXT: s_waitcnt vmcnt(1) 361; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16 362; GFX11-NEXT: s_waitcnt vmcnt(0) 363; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 364; GFX11-NEXT: s_setpc_b64 s[30:31] 365 %val0 = load <4 x half>, ptr addrspace(1) %arg0 366 %val1 = load <4 x half>, ptr addrspace(1) %arg1 367 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef> 368 ret <4 x half> %shuffle 369} 370 371define <4 x half> @shuffle_v4f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 372; GFX9-LABEL: shuffle_v4f16_0101: 373; GFX9: ; %bb.0: 374; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 375; GFX9-NEXT: global_load_dword v0, v[0:1], off 376; GFX9-NEXT: s_waitcnt vmcnt(0) 377; GFX9-NEXT: v_mov_b32_e32 v1, v0 378; GFX9-NEXT: s_setpc_b64 s[30:31] 379; 380; GFX10-LABEL: shuffle_v4f16_0101: 381; GFX10: ; %bb.0: 382; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 383; GFX10-NEXT: global_load_dword 
v0, v[0:1], off 384; GFX10-NEXT: s_waitcnt vmcnt(0) 385; GFX10-NEXT: v_mov_b32_e32 v1, v0 386; GFX10-NEXT: s_setpc_b64 s[30:31] 387; 388; GFX11-LABEL: shuffle_v4f16_0101: 389; GFX11: ; %bb.0: 390; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 391; GFX11-NEXT: global_load_b32 v0, v[0:1], off 392; GFX11-NEXT: s_waitcnt vmcnt(0) 393; GFX11-NEXT: v_mov_b32_e32 v1, v0 394; GFX11-NEXT: s_setpc_b64 s[30:31] 395 %val0 = load <4 x half>, ptr addrspace(1) %arg0 396 %val1 = load <4 x half>, ptr addrspace(1) %arg1 397 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 398 ret <4 x half> %shuffle 399} 400 401define <4 x half> @shuffle_v4f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 402; GFX9-LABEL: shuffle_v4f16_0123: 403; GFX9: ; %bb.0: 404; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 405; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 406; GFX9-NEXT: s_waitcnt vmcnt(0) 407; GFX9-NEXT: s_setpc_b64 s[30:31] 408; 409; GFX10-LABEL: shuffle_v4f16_0123: 410; GFX10: ; %bb.0: 411; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 412; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 413; GFX10-NEXT: s_waitcnt vmcnt(0) 414; GFX10-NEXT: s_setpc_b64 s[30:31] 415; 416; GFX11-LABEL: shuffle_v4f16_0123: 417; GFX11: ; %bb.0: 418; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 419; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 420; GFX11-NEXT: s_waitcnt vmcnt(0) 421; GFX11-NEXT: s_setpc_b64 s[30:31] 422 %val0 = load <4 x half>, ptr addrspace(1) %arg0 423 %val1 = load <4 x half>, ptr addrspace(1) %arg1 424 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 425 ret <4 x half> %shuffle 426} 427 428define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 429; GFX9-LABEL: shuffle_v4f16_0145: 430; GFX9: ; %bb.0: 431; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 432; GFX9-NEXT: global_load_dword v4, v[0:1], off 433; 
GFX9-NEXT: global_load_dword v5, v[2:3], off 434; GFX9-NEXT: s_waitcnt vmcnt(1) 435; GFX9-NEXT: v_mov_b32_e32 v0, v4 436; GFX9-NEXT: s_waitcnt vmcnt(0) 437; GFX9-NEXT: v_mov_b32_e32 v1, v5 438; GFX9-NEXT: s_setpc_b64 s[30:31] 439; 440; GFX10-LABEL: shuffle_v4f16_0145: 441; GFX10: ; %bb.0: 442; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 443; GFX10-NEXT: global_load_dword v4, v[0:1], off 444; GFX10-NEXT: global_load_dword v5, v[2:3], off 445; GFX10-NEXT: s_waitcnt vmcnt(1) 446; GFX10-NEXT: v_mov_b32_e32 v0, v4 447; GFX10-NEXT: s_waitcnt vmcnt(0) 448; GFX10-NEXT: v_mov_b32_e32 v1, v5 449; GFX10-NEXT: s_setpc_b64 s[30:31] 450; 451; GFX11-LABEL: shuffle_v4f16_0145: 452; GFX11: ; %bb.0: 453; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 454; GFX11-NEXT: global_load_b32 v0, v[0:1], off 455; GFX11-NEXT: global_load_b32 v1, v[2:3], off 456; GFX11-NEXT: s_waitcnt vmcnt(0) 457; GFX11-NEXT: s_setpc_b64 s[30:31] 458 %val0 = load <4 x half>, ptr addrspace(1) %arg0 459 %val1 = load <4 x half>, ptr addrspace(1) %arg1 460 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 461 ret <4 x half> %shuffle 462} 463 464define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 465; GFX9-LABEL: shuffle_v4f16_0167: 466; GFX9: ; %bb.0: 467; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 468; GFX9-NEXT: global_load_dword v4, v[0:1], off 469; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 470; GFX9-NEXT: s_waitcnt vmcnt(1) 471; GFX9-NEXT: v_mov_b32_e32 v0, v4 472; GFX9-NEXT: s_waitcnt vmcnt(0) 473; GFX9-NEXT: v_mov_b32_e32 v1, v5 474; GFX9-NEXT: s_setpc_b64 s[30:31] 475; 476; GFX10-LABEL: shuffle_v4f16_0167: 477; GFX10: ; %bb.0: 478; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 479; GFX10-NEXT: global_load_dword v4, v[0:1], off 480; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 481; GFX10-NEXT: s_waitcnt vmcnt(1) 482; GFX10-NEXT: v_mov_b32_e32 v0, v4 483; GFX10-NEXT: s_waitcnt 
vmcnt(0) 484; GFX10-NEXT: v_mov_b32_e32 v1, v5 485; GFX10-NEXT: s_setpc_b64 s[30:31] 486; 487; GFX11-LABEL: shuffle_v4f16_0167: 488; GFX11: ; %bb.0: 489; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 490; GFX11-NEXT: global_load_b32 v0, v[0:1], off 491; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 492; GFX11-NEXT: s_waitcnt vmcnt(0) 493; GFX11-NEXT: s_setpc_b64 s[30:31] 494 %val0 = load <4 x half>, ptr addrspace(1) %arg0 495 %val1 = load <4 x half>, ptr addrspace(1) %arg1 496 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 497 ret <4 x half> %shuffle 498} 499 500define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 501; GX900-LABEL: shuffle_v4f16_2301: 502; GX900: ; %bb.0: 503; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 504; GX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 505; GX900-NEXT: s_waitcnt vmcnt(0) 506; GX900-NEXT: v_mov_b32_e32 v0, v2 507; GX900-NEXT: s_setpc_b64 s[30:31] 508; 509; GFX940-LABEL: shuffle_v4f16_2301: 510; GFX940: ; %bb.0: 511; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 512; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off 513; GFX940-NEXT: s_waitcnt vmcnt(0) 514; GFX940-NEXT: v_mov_b32_e32 v0, v3 515; GFX940-NEXT: v_mov_b32_e32 v1, v2 516; GFX940-NEXT: s_setpc_b64 s[30:31] 517; 518; GFX10-LABEL: shuffle_v4f16_2301: 519; GFX10: ; %bb.0: 520; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 521; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 522; GFX10-NEXT: s_waitcnt vmcnt(0) 523; GFX10-NEXT: v_mov_b32_e32 v0, v2 524; GFX10-NEXT: s_setpc_b64 s[30:31] 525; 526; GFX11-LABEL: shuffle_v4f16_2301: 527; GFX11: ; %bb.0: 528; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 529; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off 530; GFX11-NEXT: s_waitcnt vmcnt(0) 531; GFX11-NEXT: v_mov_b32_e32 v0, v2 532; GFX11-NEXT: s_setpc_b64 s[30:31] 533 %val0 = load <4 x half>, ptr addrspace(1) %arg0 534 %val1 = load <4 x 
half>, ptr addrspace(1) %arg1 535 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 536 ret <4 x half> %shuffle 537} 538 539define <4 x half> @shuffle_v4f16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 540; GFX9-LABEL: shuffle_v4f16_2323: 541; GFX9: ; %bb.0: 542; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 543; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 544; GFX9-NEXT: s_waitcnt vmcnt(0) 545; GFX9-NEXT: v_mov_b32_e32 v1, v0 546; GFX9-NEXT: s_setpc_b64 s[30:31] 547; 548; GFX10-LABEL: shuffle_v4f16_2323: 549; GFX10: ; %bb.0: 550; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 551; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 552; GFX10-NEXT: s_waitcnt vmcnt(0) 553; GFX10-NEXT: v_mov_b32_e32 v1, v0 554; GFX10-NEXT: s_setpc_b64 s[30:31] 555; 556; GFX11-LABEL: shuffle_v4f16_2323: 557; GFX11: ; %bb.0: 558; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 559; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 560; GFX11-NEXT: s_waitcnt vmcnt(0) 561; GFX11-NEXT: v_mov_b32_e32 v1, v0 562; GFX11-NEXT: s_setpc_b64 s[30:31] 563 %val0 = load <4 x half>, ptr addrspace(1) %arg0 564 %val1 = load <4 x half>, ptr addrspace(1) %arg1 565 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 566 ret <4 x half> %shuffle 567} 568 569define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 570; GFX9-LABEL: shuffle_v4f16_2345: 571; GFX9: ; %bb.0: 572; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 573; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 574; GFX9-NEXT: global_load_dword v5, v[2:3], off 575; GFX9-NEXT: s_waitcnt vmcnt(1) 576; GFX9-NEXT: v_mov_b32_e32 v0, v4 577; GFX9-NEXT: s_waitcnt vmcnt(0) 578; GFX9-NEXT: v_mov_b32_e32 v1, v5 579; GFX9-NEXT: s_setpc_b64 s[30:31] 580; 581; GFX10-LABEL: shuffle_v4f16_2345: 582; GFX10: ; %bb.0: 583; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 584; 
GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 585; GFX10-NEXT: global_load_dword v5, v[2:3], off 586; GFX10-NEXT: s_waitcnt vmcnt(1) 587; GFX10-NEXT: v_mov_b32_e32 v0, v4 588; GFX10-NEXT: s_waitcnt vmcnt(0) 589; GFX10-NEXT: v_mov_b32_e32 v1, v5 590; GFX10-NEXT: s_setpc_b64 s[30:31] 591; 592; GFX11-LABEL: shuffle_v4f16_2345: 593; GFX11: ; %bb.0: 594; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 595; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 596; GFX11-NEXT: global_load_b32 v1, v[2:3], off 597; GFX11-NEXT: s_waitcnt vmcnt(0) 598; GFX11-NEXT: s_setpc_b64 s[30:31] 599 %val0 = load <4 x half>, ptr addrspace(1) %arg0 600 %val1 = load <4 x half>, ptr addrspace(1) %arg1 601 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 602 ret <4 x half> %shuffle 603} 604 605define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 606; GFX9-LABEL: shuffle_v4f16_2367: 607; GFX9: ; %bb.0: 608; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 609; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 610; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 611; GFX9-NEXT: s_waitcnt vmcnt(1) 612; GFX9-NEXT: v_mov_b32_e32 v0, v4 613; GFX9-NEXT: s_waitcnt vmcnt(0) 614; GFX9-NEXT: v_mov_b32_e32 v1, v5 615; GFX9-NEXT: s_setpc_b64 s[30:31] 616; 617; GFX10-LABEL: shuffle_v4f16_2367: 618; GFX10: ; %bb.0: 619; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 620; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 621; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 622; GFX10-NEXT: s_waitcnt vmcnt(1) 623; GFX10-NEXT: v_mov_b32_e32 v0, v4 624; GFX10-NEXT: s_waitcnt vmcnt(0) 625; GFX10-NEXT: v_mov_b32_e32 v1, v5 626; GFX10-NEXT: s_setpc_b64 s[30:31] 627; 628; GFX11-LABEL: shuffle_v4f16_2367: 629; GFX11: ; %bb.0: 630; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 631; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 632; GFX11-NEXT: global_load_b32 v1, v[2:3], off 
offset:4 633; GFX11-NEXT: s_waitcnt vmcnt(0) 634; GFX11-NEXT: s_setpc_b64 s[30:31] 635 %val0 = load <4 x half>, ptr addrspace(1) %arg0 636 %val1 = load <4 x half>, ptr addrspace(1) %arg1 637 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 638 ret <4 x half> %shuffle 639} 640 641define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 642; GFX9-LABEL: shuffle_v4f16_4501: 643; GFX9: ; %bb.0: 644; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 645; GFX9-NEXT: global_load_dword v4, v[2:3], off 646; GFX9-NEXT: global_load_dword v5, v[0:1], off 647; GFX9-NEXT: s_waitcnt vmcnt(1) 648; GFX9-NEXT: v_mov_b32_e32 v0, v4 649; GFX9-NEXT: s_waitcnt vmcnt(0) 650; GFX9-NEXT: v_mov_b32_e32 v1, v5 651; GFX9-NEXT: s_setpc_b64 s[30:31] 652; 653; GFX10-LABEL: shuffle_v4f16_4501: 654; GFX10: ; %bb.0: 655; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 656; GFX10-NEXT: global_load_dword v4, v[2:3], off 657; GFX10-NEXT: global_load_dword v5, v[0:1], off 658; GFX10-NEXT: s_waitcnt vmcnt(1) 659; GFX10-NEXT: v_mov_b32_e32 v0, v4 660; GFX10-NEXT: s_waitcnt vmcnt(0) 661; GFX10-NEXT: v_mov_b32_e32 v1, v5 662; GFX10-NEXT: s_setpc_b64 s[30:31] 663; 664; GFX11-LABEL: shuffle_v4f16_4501: 665; GFX11: ; %bb.0: 666; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 667; GFX11-NEXT: global_load_b32 v2, v[2:3], off 668; GFX11-NEXT: global_load_b32 v1, v[0:1], off 669; GFX11-NEXT: s_waitcnt vmcnt(1) 670; GFX11-NEXT: v_mov_b32_e32 v0, v2 671; GFX11-NEXT: s_waitcnt vmcnt(0) 672; GFX11-NEXT: s_setpc_b64 s[30:31] 673 %val0 = load <4 x half>, ptr addrspace(1) %arg0 674 %val1 = load <4 x half>, ptr addrspace(1) %arg1 675 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 676 ret <4 x half> %shuffle 677} 678 679define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 680; GFX9-LABEL: shuffle_v4f16_4523: 681; GFX9: ; %bb.0: 682; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 683; GFX9-NEXT: global_load_dword v4, v[2:3], off 684; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 685; GFX9-NEXT: s_waitcnt vmcnt(1) 686; GFX9-NEXT: v_mov_b32_e32 v0, v4 687; GFX9-NEXT: s_waitcnt vmcnt(0) 688; GFX9-NEXT: v_mov_b32_e32 v1, v5 689; GFX9-NEXT: s_setpc_b64 s[30:31] 690; 691; GFX10-LABEL: shuffle_v4f16_4523: 692; GFX10: ; %bb.0: 693; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 694; GFX10-NEXT: global_load_dword v4, v[2:3], off 695; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 696; GFX10-NEXT: s_waitcnt vmcnt(1) 697; GFX10-NEXT: v_mov_b32_e32 v0, v4 698; GFX10-NEXT: s_waitcnt vmcnt(0) 699; GFX10-NEXT: v_mov_b32_e32 v1, v5 700; GFX10-NEXT: s_setpc_b64 s[30:31] 701; 702; GFX11-LABEL: shuffle_v4f16_4523: 703; GFX11: ; %bb.0: 704; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 705; GFX11-NEXT: global_load_b32 v2, v[2:3], off 706; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 707; GFX11-NEXT: s_waitcnt vmcnt(1) 708; GFX11-NEXT: v_mov_b32_e32 v0, v2 709; GFX11-NEXT: s_waitcnt vmcnt(0) 710; GFX11-NEXT: s_setpc_b64 s[30:31] 711 %val0 = load <4 x half>, ptr addrspace(1) %arg0 712 %val1 = load <4 x half>, ptr addrspace(1) %arg1 713 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 714 ret <4 x half> %shuffle 715} 716 717define <4 x half> @shuffle_v4f16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 718; GFX9-LABEL: shuffle_v4f16_4545: 719; GFX9: ; %bb.0: 720; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 721; GFX9-NEXT: global_load_dword v0, v[2:3], off 722; GFX9-NEXT: s_waitcnt vmcnt(0) 723; GFX9-NEXT: v_mov_b32_e32 v1, v0 724; GFX9-NEXT: s_setpc_b64 s[30:31] 725; 726; GFX10-LABEL: shuffle_v4f16_4545: 727; GFX10: ; %bb.0: 728; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 729; GFX10-NEXT: global_load_dword v0, v[2:3], off 730; GFX10-NEXT: s_waitcnt vmcnt(0) 731; GFX10-NEXT: v_mov_b32_e32 v1, v0 732; GFX10-NEXT: 
s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_4545:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Mask <4,5,6,7> selects every element of %val1, in order.
define <4 x half> @shuffle_v4f16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_4567:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_4567:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_4567:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Mask <6,7,0,1> selects elements 2-3 of %val1 followed by elements 0-1 of %val0.
define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6701:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6701:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Mask <6,7,2,3> selects elements 2-3 of %val1 followed by elements 2-3 of %val0.
define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6723:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6723:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <6,7,4,5> selects elements 2-3 of %val1 followed by elements 0-1 of %val1.
define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_6745:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_mov_b32_e32 v0, v2
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_6745:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, v3
; GFX940-NEXT:    v_mov_b32_e32 v1, v2
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6745:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6745:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
  ret <4 x half> %shuffle
}

; Mask <6,7,6,7> repeats elements 2-3 of %val1 in both halves of the result.
define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_6767:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6767:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6767:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
  ret <4 x half> %shuffle
}

; Mask <2,3,5,6> selects elements 2-3 of %val0 followed by elements 1-2 of %val1.
define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_2356:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_mov_b32_e32 v0, v4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_2356:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, v4
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2356:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_2356:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x half> %shuffle
}

; Mask <5,6,2,3> selects elements 1-2 of %val1 followed by elements 2-3 of %val0.
define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_5623:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_mov_b32_e32 v1, v4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_5623:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, v4
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_5623:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_5623:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <3,4,5,6> selects element 3 of %val0 followed by elements 0-2 of %val1.
define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_3456:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_3456:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_3456:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  ret <4 x half> %shuffle
}

; Mask <5,6,3,4> selects elements 1-2 of %val1, then element 3 of %val0 and element 0 of %val1.
define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_5634:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_5634:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_5634:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
  ret <4 x half> %shuffle
}

; Mask <5,7,3,4> selects elements 1 and 3 of %val1, then element 3 of %val0 and element 0 of %val1.
define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_5734:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GX900-NEXT:    s_mov_b32 s4, 0x7060302
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_alignbit_b32 v1, v4, v6, 16
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_5734:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_alignbit_b32 v1, v4, v6, 16
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_5734:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_5734:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
  ret <4 x half> %shuffle
}

; Integer (<4 x i16>) counterpart of the <2,3,5,6> half-precision test.
define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4i16_2356:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_mov_b32_e32 v0, v4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4i16_2356:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, v4
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4i16_2356:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4i16_2356:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
  ret <4 x i16> %shuffle
}

; Mask <0,1,6,7> on <4 x i16> selects elements 0-1 of %val0 followed by elements 2-3 of %val1.
define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4i16_0167:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4i16_0167:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4i16_0167:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %shuffle
}

; A zeroinitializer mask broadcasts element 0 of %val0 to all four lanes.
define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_0000:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GX900-NEXT:    s_mov_b32 s4, 0x5040100
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
; GX900-NEXT:    v_mov_b32_e32 v1, v0
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_0000:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
; GFX940-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_0000:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_0000:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
  ret <4 x half> %shuffle
}

; Mask <1,0,1,0> swaps elements 0 and 1 of %val0 and repeats the swapped pair.
define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4f16_1010:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_1010:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_1010:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  ret <4 x half> %shuffle
}
; Mask <1,1,0,0> duplicates element 1 of %val0 into the low pair and element 0 into the high pair.
define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_1100:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GX900-NEXT:    s_mov_b32 s4, 0x7060302
; GX900-NEXT:    s_mov_b32 s5, 0x5040100
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GX900-NEXT:    v_perm_b32 v1, v1, v1, s5
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_1100:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX940-NEXT:    s_mov_b32 s1, 0x5040100
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v0, v2, v2, s0
; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s1
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_1100:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_1100:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
  ret <4 x half> %shuffle
}

; Mask <6,1,6,1> interleaves element 2 of %val1 with element 1 of %val0, twice.
define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_6161:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dword v4, v[0:1], off
; GX900-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GX900-NEXT:    s_mov_b32 s4, 0xffff
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_bfi_b32 v0, s4, v5, v4
; GX900-NEXT:    v_mov_b32_e32 v1, v0
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_6161:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v4, v[0:1], off
; GFX940-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX940-NEXT:    s_mov_b32 s0, 0xffff
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v0, s0, v5, v4
; GFX940-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6161:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6161:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
  ret <4 x half> %shuffle
}
; Mask <2,3,3,3> selects elements 2-3 of %val0, then element 3 of %val0 twice.
define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_2333:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GX900-NEXT:    s_mov_b32 s4, 0x7060302
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_perm_b32 v1, v0, v0, s4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_2333:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_2333:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_2333:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
  ret <4 x half> %shuffle
}

; FIXME: The mask below is <2,3,3,3>, the same as @shuffle_v4f16_2333, so this
; test never reads %val1 and duplicates that test. The function name suggests
; <6,6,6,7> was intended. If the mask is corrected, the assertions must be
; regenerated with utils/update_llc_test_checks.py.
define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4f16_6667:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GX900-NEXT:    s_mov_b32 s4, 0x7060302
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_perm_b32 v1, v0, v0, s4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4f16_6667:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4f16_6667:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4f16_6667:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x half>, ptr addrspace(1) %arg0
  %val1 = load <4 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
  ret <4 x half> %shuffle
}

; <8 x half> sources narrowed to a <4 x half> result; mask <0,1,0,1> repeats elements 0-1 of %val0.
define <4 x half> @shuffle_v8f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v8f16_0101:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8f16_0101:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8f16_0101:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x half> %shuffle
}

; Mask <0,1,2,3> extracts the first four elements of the <8 x half> %val0.
define <4 x half> @shuffle_v8f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v8f16_0123:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8f16_0123:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8f16_0123:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <4,5,8,9> selects elements 4-5 of %val0 followed by elements 0-1 of %val1 (8-wide sources).
define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v8f16_4589:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8f16_4589:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8f16_4589:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
  ret <4 x half> %shuffle
}

; Mask <10,11,2,3> selects elements 2-3 of %val1 followed by elements 2-3 of %val0 (8-wide sources).
define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8f16_10_11_2_3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; Mask <13,14,2,3> selects elements 5-6 of %val1 followed by elements 2-3 of %val0 (8-wide sources).
define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v8f16_13_14_2_3:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_mov_b32_e32 v1, v4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v8f16_13_14_2_3:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:8
; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, v4
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8f16_13_14_2_3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x half>, ptr addrspace(1) %arg0
  %val1 = load <8 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
  ret <4 x half> %shuffle
}

; <3 x half> sources widened to a <4 x half> result; mask <0,1,2,2> repeats the last element of %val0.
define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v3f16_0122:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GX900-NEXT:    s_mov_b32 s4, 0x5040100
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_perm_b32 v1, v1, v1, s4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v3f16_0122:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v3f16_0122:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v3f16_0122:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <3 x half>, ptr addrspace(1) %arg0
  %val1 = load <3 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x half> %shuffle
}

; NOTE(review): The mask below is <0,1,1,0>, not the <0,1,2,2> the function
; name suggests. The assertions match the mask as written (only %arg0 is
; loaded). Rename the test or change the mask and regenerate the assertions
; with utils/update_llc_test_checks.py.
define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v2f16_0122:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_alignbit_b32 v1, v0, v0, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v2f16_0122:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v2f16_0122:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <2 x half>, ptr addrspace(1) %arg0
  %val1 = load <2 x half>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  ret <4 x half> %shuffle
}

define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v6f16_452367:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) 1740; GX900-NEXT: v_mov_b32_e32 v6, v1 1741; GX900-NEXT: v_mov_b32_e32 v5, v0 1742; GX900-NEXT: v_mov_b32_e32 v4, v3 1743; GX900-NEXT: v_mov_b32_e32 v3, v2 1744; GX900-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1745; GX900-NEXT: global_load_dword v7, v[3:4], off 1746; GX900-NEXT: s_waitcnt vmcnt(1) 1747; GX900-NEXT: v_mov_b32_e32 v0, v2 1748; GX900-NEXT: s_waitcnt vmcnt(0) 1749; GX900-NEXT: v_mov_b32_e32 v2, v7 1750; GX900-NEXT: s_setpc_b64 s[30:31] 1751; 1752; GFX940-LABEL: shuffle_v6f16_452367: 1753; GFX940: ; %bb.0: 1754; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1755; GFX940-NEXT: v_mov_b32_e32 v7, v1 1756; GFX940-NEXT: v_mov_b32_e32 v6, v0 1757; GFX940-NEXT: v_mov_b32_e32 v5, v3 1758; GFX940-NEXT: v_mov_b32_e32 v4, v2 1759; GFX940-NEXT: global_load_dwordx3 v[0:2], v[6:7], off 1760; GFX940-NEXT: global_load_dword v3, v[4:5], off 1761; GFX940-NEXT: s_waitcnt vmcnt(1) 1762; GFX940-NEXT: v_mov_b32_e32 v0, v2 1763; GFX940-NEXT: s_waitcnt vmcnt(0) 1764; GFX940-NEXT: v_mov_b32_e32 v2, v3 1765; GFX940-NEXT: s_setpc_b64 s[30:31] 1766; 1767; GFX10-LABEL: shuffle_v6f16_452367: 1768; GFX10: ; %bb.0: 1769; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1770; GFX10-NEXT: v_mov_b32_e32 v6, v1 1771; GFX10-NEXT: v_mov_b32_e32 v5, v0 1772; GFX10-NEXT: v_mov_b32_e32 v4, v3 1773; GFX10-NEXT: v_mov_b32_e32 v3, v2 1774; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 1775; GFX10-NEXT: global_load_dword v7, v[3:4], off 1776; GFX10-NEXT: s_waitcnt vmcnt(1) 1777; GFX10-NEXT: v_mov_b32_e32 v0, v2 1778; GFX10-NEXT: s_waitcnt vmcnt(0) 1779; GFX10-NEXT: v_mov_b32_e32 v2, v7 1780; GFX10-NEXT: s_setpc_b64 s[30:31] 1781; 1782; GFX11-LABEL: shuffle_v6f16_452367: 1783; GFX11: ; %bb.0: 1784; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1785; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 1786; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off 1787; GFX11-NEXT: global_load_b32 v3, v[3:4], off 1788; GFX11-NEXT: s_waitcnt vmcnt(1) 1789; 
GFX11-NEXT: v_mov_b32_e32 v0, v2 1790; GFX11-NEXT: s_waitcnt vmcnt(0) 1791; GFX11-NEXT: v_mov_b32_e32 v2, v3 1792; GFX11-NEXT: s_setpc_b64 s[30:31] 1793 %val0 = load <6 x half>, ptr addrspace(1) %arg0 1794 %val1 = load <6 x half>, ptr addrspace(1) %arg1 1795 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7> 1796 ret <6 x half> %shuffle 1797} 1798 1799define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { 1800; GX900-LABEL: fma_shuffle_v2f16: 1801; GX900: ; %bb.0: ; %entry 1802; GX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1803; GX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1804; GX900-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1805; GX900-NEXT: s_waitcnt lgkmcnt(0) 1806; GX900-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1807; GX900-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1808; GX900-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] 1809; GX900-NEXT: s_waitcnt vmcnt(0) 1810; GX900-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1811; GX900-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1812; GX900-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1813; GX900-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1814; GX900-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] 1815; GX900-NEXT: s_endpgm 1816; 1817; GFX940-LABEL: fma_shuffle_v2f16: 1818; GFX940: ; %bb.0: ; %entry 1819; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1820; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1821; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1822; GFX940-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1823; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1824; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1825; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1826; GFX940-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 1827; GFX940-NEXT: s_waitcnt vmcnt(0) 1828; GFX940-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1829; GFX940-NEXT: 
v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1830; GFX940-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1831; GFX940-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1832; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] sc0 sc1 1833; GFX940-NEXT: s_endpgm 1834; 1835; GFX10-LABEL: fma_shuffle_v2f16: 1836; GFX10: ; %bb.0: ; %entry 1837; GFX10-NEXT: s_clause 0x1 1838; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1839; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1840; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1841; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1842; GFX10-NEXT: s_clause 0x2 1843; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 1844; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] 1845; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] 1846; GFX10-NEXT: s_waitcnt vmcnt(0) 1847; GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1848; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1849; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] 1850; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1851; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] 1852; GFX10-NEXT: s_endpgm 1853; 1854; GFX11-LABEL: fma_shuffle_v2f16: 1855; GFX11: ; %bb.0: ; %entry 1856; GFX11-NEXT: s_clause 0x1 1857; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1858; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 1859; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1860; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1861; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 1862; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1863; GFX11-NEXT: s_clause 0x2 1864; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1] 1865; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] 1866; GFX11-NEXT: global_load_b64 v[4:5], v6, s[4:5] 1867; GFX11-NEXT: s_waitcnt vmcnt(0) 1868; GFX11-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] 1869; GFX11-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] 1870; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1871; GFX11-NEXT: v_pk_fma_f16 
v0, v0, v3, v4 op_sel:[1,0,0] 1872; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] 1873; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] 1874; GFX11-NEXT: s_endpgm 1875entry: 1876 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() 1877 %tmp12 = zext i32 %tmp1 to i64 1878 %arrayidx = getelementptr inbounds <4 x half>, ptr addrspace(1) %A, i64 %tmp12 1879 %tmp14 = load <4 x half>, ptr addrspace(1) %arrayidx, align 8 1880 %arrayidx1 = getelementptr inbounds <4 x half>, ptr addrspace(1) %B, i64 %tmp12 1881 %tmp15 = load <4 x half>, ptr addrspace(1) %arrayidx1, align 8 1882 %arrayidx2 = getelementptr inbounds <4 x half>, ptr addrspace(1) %C, i64 %tmp12 1883 %tmp16 = load <4 x half>, ptr addrspace(1) %arrayidx2, align 8 1884 %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer 1885 %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1886 %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1> 1887 %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19) 1888 %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1> 1889 %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1890 %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20) 1891 %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1892 %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 1893 %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2> 1894 %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3> 1895 %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27) 1896 %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3> 1897 
%tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28) 1898 %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1899 %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1900 store <4 x half> %tmp32, ptr addrspace(1) %arrayidx2, align 8 1901 ret void 1902} 1903 1904define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 1905; GX900-LABEL: shuffle_v4f16_0456: 1906; GX900: ; %bb.0: 1907; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1908; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1909; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1910; GX900-NEXT: s_mov_b32 s4, 0x5040100 1911; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1912; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1913; GX900-NEXT: s_waitcnt vmcnt(0) 1914; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 1915; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16 1916; GX900-NEXT: s_setpc_b64 s[30:31] 1917; 1918; GFX940-LABEL: shuffle_v4f16_0456: 1919; GFX940: ; %bb.0: 1920; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1921; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1922; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off 1923; GFX940-NEXT: s_mov_b32 s0, 0x5040100 1924; GFX940-NEXT: s_waitcnt vmcnt(0) 1925; GFX940-NEXT: v_perm_b32 v0, v6, v4, s0 1926; GFX940-NEXT: v_alignbit_b32 v1, v7, v6, 16 1927; GFX940-NEXT: s_setpc_b64 s[30:31] 1928; 1929; GFX10-LABEL: shuffle_v4f16_0456: 1930; GFX10: ; %bb.0: 1931; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1932; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 1933; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 1934; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1935; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1936; GFX10-NEXT: s_waitcnt vmcnt(0) 1937; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 1938; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 
16 1939; GFX10-NEXT: s_setpc_b64 s[30:31] 1940; 1941; GFX11-LABEL: shuffle_v4f16_0456: 1942; GFX11: ; %bb.0: 1943; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1944; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1945; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off 1946; GFX11-NEXT: s_waitcnt vmcnt(0) 1947; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 1948; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 1949; GFX11-NEXT: s_setpc_b64 s[30:31] 1950 %val0 = load <4 x half>, ptr addrspace(1) %arg0 1951 %val1 = load <4 x half>, ptr addrspace(1) %arg1 1952 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1953 ret <4 x half> %shuffle 1954} 1955 1956define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out) { 1957; GX900-LABEL: shuffle_scalar_load_v8i32_0123: 1958; GX900: ; %bb.0: 1959; GX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1960; GX900-NEXT: v_mov_b32_e32 v4, 0 1961; GX900-NEXT: s_waitcnt lgkmcnt(0) 1962; GX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1963; GX900-NEXT: s_waitcnt lgkmcnt(0) 1964; GX900-NEXT: v_mov_b32_e32 v0, s4 1965; GX900-NEXT: v_mov_b32_e32 v1, s5 1966; GX900-NEXT: v_mov_b32_e32 v2, s6 1967; GX900-NEXT: v_mov_b32_e32 v3, s7 1968; GX900-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1969; GX900-NEXT: s_endpgm 1970; 1971; GFX940-LABEL: shuffle_scalar_load_v8i32_0123: 1972; GFX940: ; %bb.0: 1973; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1974; GFX940-NEXT: v_mov_b32_e32 v4, 0 1975; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1976; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1977; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1978; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] 1979; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7] 1980; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 1981; GFX940-NEXT: s_endpgm 1982; 1983; GFX10-LABEL: shuffle_scalar_load_v8i32_0123: 1984; GFX10: ; %bb.0: 1985; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1986; GFX10-NEXT: 
v_mov_b32_e32 v4, 0 1987; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1988; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 1989; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1990; GFX10-NEXT: v_mov_b32_e32 v0, s4 1991; GFX10-NEXT: v_mov_b32_e32 v1, s5 1992; GFX10-NEXT: v_mov_b32_e32 v2, s6 1993; GFX10-NEXT: v_mov_b32_e32 v3, s7 1994; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 1995; GFX10-NEXT: s_endpgm 1996; 1997; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: 1998; GFX11: ; %bb.0: 1999; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2000; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2001; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 2002; GFX11-NEXT: v_mov_b32_e32 v4, 0 2003; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2004; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 2005; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 2006; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] 2007; GFX11-NEXT: s_endpgm 2008 %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16 2009 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2010 store <4 x i32> %id, ptr addrspace(1) %out, align 8 2011 ret void 2012} 2013 2014define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2015; GX900-LABEL: low16bits_v2f16: 2016; GX900: ; %bb.0: ; %entry 2017; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2018; GX900-NEXT: global_load_dword v4, v[0:1], off 2019; GX900-NEXT: global_load_dword v5, v[2:3], off 2020; GX900-NEXT: s_mov_b32 s4, 0x5040100 2021; GX900-NEXT: s_waitcnt vmcnt(0) 2022; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 2023; GX900-NEXT: s_setpc_b64 s[30:31] 2024; 2025; GFX940-LABEL: low16bits_v2f16: 2026; GFX940: ; %bb.0: ; %entry 2027; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2028; GFX940-NEXT: global_load_dword v4, v[0:1], off 2029; GFX940-NEXT: global_load_dword v5, v[2:3], off 2030; GFX940-NEXT: s_mov_b32 s0, 0x5040100 2031; GFX940-NEXT: s_waitcnt vmcnt(0) 2032; GFX940-NEXT: v_perm_b32 v0, v5, v4, s0 2033; GFX940-NEXT: 
s_setpc_b64 s[30:31] 2034; 2035; GFX10-LABEL: low16bits_v2f16: 2036; GFX10: ; %bb.0: ; %entry 2037; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2038; GFX10-NEXT: global_load_dword v4, v[0:1], off 2039; GFX10-NEXT: global_load_dword v5, v[2:3], off 2040; GFX10-NEXT: s_waitcnt vmcnt(0) 2041; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 2042; GFX10-NEXT: s_setpc_b64 s[30:31] 2043; 2044; GFX11-LABEL: low16bits_v2f16: 2045; GFX11: ; %bb.0: ; %entry 2046; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2047; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2048; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2049; GFX11-NEXT: s_waitcnt vmcnt(0) 2050; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 2051; GFX11-NEXT: s_setpc_b64 s[30:31] 2052entry: 2053 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4 2054 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4 2055 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef> 2056 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2> 2057 ret <2 x half> %vy1.2.vec.insert 2058} 2059 2060define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2061; GX900-LABEL: hi16bits_v2f16: 2062; GX900: ; %bb.0: ; %entry 2063; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2064; GX900-NEXT: global_load_dword v4, v[0:1], off 2065; GX900-NEXT: global_load_dword v5, v[2:3], off 2066; GX900-NEXT: s_mov_b32 s4, 0x7060302 2067; GX900-NEXT: s_waitcnt vmcnt(0) 2068; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 2069; GX900-NEXT: s_setpc_b64 s[30:31] 2070; 2071; GFX940-LABEL: hi16bits_v2f16: 2072; GFX940: ; %bb.0: ; %entry 2073; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2074; GFX940-NEXT: global_load_dword v4, v[0:1], off 2075; GFX940-NEXT: global_load_dword v5, v[2:3], off 2076; GFX940-NEXT: s_mov_b32 s0, 0x7060302 2077; GFX940-NEXT: s_waitcnt vmcnt(0) 2078; GFX940-NEXT: v_perm_b32 v0, v5, v4, s0 2079; GFX940-NEXT: s_setpc_b64 
s[30:31] 2080; 2081; GFX10-LABEL: hi16bits_v2f16: 2082; GFX10: ; %bb.0: ; %entry 2083; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2084; GFX10-NEXT: global_load_dword v4, v[0:1], off 2085; GFX10-NEXT: global_load_dword v5, v[2:3], off 2086; GFX10-NEXT: s_waitcnt vmcnt(0) 2087; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 2088; GFX10-NEXT: s_setpc_b64 s[30:31] 2089; 2090; GFX11-LABEL: hi16bits_v2f16: 2091; GFX11: ; %bb.0: ; %entry 2092; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2093; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2094; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2095; GFX11-NEXT: s_waitcnt vmcnt(0) 2096; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 2097; GFX11-NEXT: s_setpc_b64 s[30:31] 2098entry: 2099 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4 2100 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4 2101 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef> 2102 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3> 2103 ret <2 x half> %vy1.2.vec.insert 2104} 2105 2106define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2107; GX900-LABEL: low16hi16bits_v2f16: 2108; GX900: ; %bb.0: ; %entry 2109; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2110; GX900-NEXT: global_load_dword v4, v[0:1], off 2111; GX900-NEXT: global_load_dword v5, v[2:3], off 2112; GX900-NEXT: s_mov_b32 s4, 0xffff 2113; GX900-NEXT: s_waitcnt vmcnt(0) 2114; GX900-NEXT: v_bfi_b32 v0, s4, v4, v5 2115; GX900-NEXT: s_setpc_b64 s[30:31] 2116; 2117; GFX940-LABEL: low16hi16bits_v2f16: 2118; GFX940: ; %bb.0: ; %entry 2119; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2120; GFX940-NEXT: global_load_dword v4, v[0:1], off 2121; GFX940-NEXT: global_load_dword v5, v[2:3], off 2122; GFX940-NEXT: s_mov_b32 s0, 0xffff 2123; GFX940-NEXT: s_waitcnt vmcnt(0) 2124; GFX940-NEXT: v_bfi_b32 v0, s0, v4, v5 2125; GFX940-NEXT: s_setpc_b64 s[30:31] 
2126; 2127; GFX10-LABEL: low16hi16bits_v2f16: 2128; GFX10: ; %bb.0: ; %entry 2129; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2130; GFX10-NEXT: global_load_dword v4, v[0:1], off 2131; GFX10-NEXT: global_load_dword v5, v[2:3], off 2132; GFX10-NEXT: s_waitcnt vmcnt(0) 2133; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 2134; GFX10-NEXT: s_setpc_b64 s[30:31] 2135; 2136; GFX11-LABEL: low16hi16bits_v2f16: 2137; GFX11: ; %bb.0: ; %entry 2138; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2139; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2140; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2141; GFX11-NEXT: s_waitcnt vmcnt(0) 2142; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 2143; GFX11-NEXT: s_setpc_b64 s[30:31] 2144entry: 2145 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4 2146 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4 2147 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef> 2148 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3> 2149 ret <2 x half> %vy1.2.vec.insert 2150} 2151 2152define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2153; GFX9-LABEL: hi16low16bits_v2bf16: 2154; GFX9: ; %bb.0: ; %entry 2155; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2156; GFX9-NEXT: global_load_dword v4, v[0:1], off 2157; GFX9-NEXT: global_load_dword v5, v[2:3], off 2158; GFX9-NEXT: s_waitcnt vmcnt(0) 2159; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 2160; GFX9-NEXT: s_setpc_b64 s[30:31] 2161; 2162; GFX10-LABEL: hi16low16bits_v2bf16: 2163; GFX10: ; %bb.0: ; %entry 2164; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2165; GFX10-NEXT: global_load_dword v4, v[0:1], off 2166; GFX10-NEXT: global_load_dword v5, v[2:3], off 2167; GFX10-NEXT: s_waitcnt vmcnt(0) 2168; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 2169; GFX10-NEXT: s_setpc_b64 s[30:31] 2170; 2171; GFX11-LABEL: hi16low16bits_v2bf16: 2172; GFX11: ; %bb.0: ; %entry 2173; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2174; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2175; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2176; GFX11-NEXT: s_waitcnt vmcnt(0) 2177; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 2178; GFX11-NEXT: s_setpc_b64 s[30:31] 2179entry: 2180 %0 = load <2 x half>, ptr addrspace(1) %x0, align 4 2181 %1 = load <2 x half>, ptr addrspace(1) %x1, align 4 2182 %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef> 2183 %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2> 2184 ret <2 x half> %vy1.2.vec.insert 2185} 2186 2187define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2188; GX900-LABEL: i16_low16bits: 2189; GX900: ; %bb.0: ; %entry 2190; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2191; GX900-NEXT: global_load_dword v4, v[0:1], off 2192; GX900-NEXT: global_load_dword v5, v[2:3], off 2193; GX900-NEXT: s_mov_b32 s4, 0x5040100 2194; GX900-NEXT: s_waitcnt vmcnt(0) 2195; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 2196; GX900-NEXT: s_setpc_b64 s[30:31] 2197; 2198; GFX940-LABEL: i16_low16bits: 2199; GFX940: ; %bb.0: ; %entry 2200; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2201; GFX940-NEXT: global_load_dword v4, v[0:1], off 2202; GFX940-NEXT: global_load_dword v5, v[2:3], off 2203; GFX940-NEXT: s_mov_b32 s0, 0x5040100 2204; GFX940-NEXT: s_waitcnt vmcnt(0) 2205; GFX940-NEXT: v_perm_b32 v0, v5, v4, s0 2206; GFX940-NEXT: s_setpc_b64 s[30:31] 2207; 2208; GFX10-LABEL: i16_low16bits: 2209; GFX10: ; %bb.0: ; %entry 2210; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2211; GFX10-NEXT: global_load_dword v4, v[0:1], off 2212; GFX10-NEXT: global_load_dword v5, v[2:3], off 2213; GFX10-NEXT: s_waitcnt vmcnt(0) 2214; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 2215; GFX10-NEXT: s_setpc_b64 s[30:31] 2216; 2217; GFX11-LABEL: i16_low16bits: 2218; GFX11: ; %bb.0: ; %entry 2219; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 2220; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2221; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2222; GFX11-NEXT: s_waitcnt vmcnt(0) 2223; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 2224; GFX11-NEXT: s_setpc_b64 s[30:31] 2225entry: 2226 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4 2227 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4 2228 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef> 2229 %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2> 2230 ret <2 x i16> %vy1.2.vec.insert 2231} 2232 2233define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2234; GX900-LABEL: i16_low16hi16bits: 2235; GX900: ; %bb.0: ; %entry 2236; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2237; GX900-NEXT: global_load_dword v4, v[0:1], off 2238; GX900-NEXT: global_load_dword v5, v[2:3], off 2239; GX900-NEXT: s_mov_b32 s4, 0xffff 2240; GX900-NEXT: s_waitcnt vmcnt(0) 2241; GX900-NEXT: v_bfi_b32 v0, s4, v4, v5 2242; GX900-NEXT: s_setpc_b64 s[30:31] 2243; 2244; GFX940-LABEL: i16_low16hi16bits: 2245; GFX940: ; %bb.0: ; %entry 2246; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2247; GFX940-NEXT: global_load_dword v4, v[0:1], off 2248; GFX940-NEXT: global_load_dword v5, v[2:3], off 2249; GFX940-NEXT: s_mov_b32 s0, 0xffff 2250; GFX940-NEXT: s_waitcnt vmcnt(0) 2251; GFX940-NEXT: v_bfi_b32 v0, s0, v4, v5 2252; GFX940-NEXT: s_setpc_b64 s[30:31] 2253; 2254; GFX10-LABEL: i16_low16hi16bits: 2255; GFX10: ; %bb.0: ; %entry 2256; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2257; GFX10-NEXT: global_load_dword v4, v[0:1], off 2258; GFX10-NEXT: global_load_dword v5, v[2:3], off 2259; GFX10-NEXT: s_waitcnt vmcnt(0) 2260; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 2261; GFX10-NEXT: s_setpc_b64 s[30:31] 2262; 2263; GFX11-LABEL: i16_low16hi16bits: 2264; GFX11: ; %bb.0: ; %entry 2265; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 2266; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2267; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2268; GFX11-NEXT: s_waitcnt vmcnt(0) 2269; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 2270; GFX11-NEXT: s_setpc_b64 s[30:31] 2271entry: 2272 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4 2273 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4 2274 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef> 2275 %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3> 2276 ret <2 x i16> %vy1.2.vec.insert 2277} 2278 2279define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2280; GFX9-LABEL: i16_hi16low16bits: 2281; GFX9: ; %bb.0: ; %entry 2282; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2283; GFX9-NEXT: global_load_dword v4, v[0:1], off 2284; GFX9-NEXT: global_load_dword v5, v[2:3], off 2285; GFX9-NEXT: s_waitcnt vmcnt(0) 2286; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 2287; GFX9-NEXT: s_setpc_b64 s[30:31] 2288; 2289; GFX10-LABEL: i16_hi16low16bits: 2290; GFX10: ; %bb.0: ; %entry 2291; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2292; GFX10-NEXT: global_load_dword v4, v[0:1], off 2293; GFX10-NEXT: global_load_dword v5, v[2:3], off 2294; GFX10-NEXT: s_waitcnt vmcnt(0) 2295; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 2296; GFX10-NEXT: s_setpc_b64 s[30:31] 2297; 2298; GFX11-LABEL: i16_hi16low16bits: 2299; GFX11: ; %bb.0: ; %entry 2300; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2301; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2302; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2303; GFX11-NEXT: s_waitcnt vmcnt(0) 2304; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 2305; GFX11-NEXT: s_setpc_b64 s[30:31] 2306entry: 2307 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4 2308 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4 2309 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef> 2310 
%vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2> 2311 ret <2 x i16> %vy1.2.vec.insert 2312} 2313 2314define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 2315; GX900-LABEL: i16_hi16bits: 2316; GX900: ; %bb.0: ; %entry 2317; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2318; GX900-NEXT: global_load_dword v4, v[0:1], off 2319; GX900-NEXT: global_load_dword v5, v[2:3], off 2320; GX900-NEXT: s_mov_b32 s4, 0x7060302 2321; GX900-NEXT: s_waitcnt vmcnt(0) 2322; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 2323; GX900-NEXT: s_setpc_b64 s[30:31] 2324; 2325; GFX940-LABEL: i16_hi16bits: 2326; GFX940: ; %bb.0: ; %entry 2327; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2328; GFX940-NEXT: global_load_dword v4, v[0:1], off 2329; GFX940-NEXT: global_load_dword v5, v[2:3], off 2330; GFX940-NEXT: s_mov_b32 s0, 0x7060302 2331; GFX940-NEXT: s_waitcnt vmcnt(0) 2332; GFX940-NEXT: v_perm_b32 v0, v5, v4, s0 2333; GFX940-NEXT: s_setpc_b64 s[30:31] 2334; 2335; GFX10-LABEL: i16_hi16bits: 2336; GFX10: ; %bb.0: ; %entry 2337; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2338; GFX10-NEXT: global_load_dword v4, v[0:1], off 2339; GFX10-NEXT: global_load_dword v5, v[2:3], off 2340; GFX10-NEXT: s_waitcnt vmcnt(0) 2341; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 2342; GFX10-NEXT: s_setpc_b64 s[30:31] 2343; 2344; GFX11-LABEL: i16_hi16bits: 2345; GFX11: ; %bb.0: ; %entry 2346; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2347; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2348; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2349; GFX11-NEXT: s_waitcnt vmcnt(0) 2350; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 2351; GFX11-NEXT: s_setpc_b64 s[30:31] 2352entry: 2353 %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4 2354 %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4 2355 %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef> 2356 %vy1.2.vec.insert = shufflevector 
<2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3> 2357 ret <2 x i16> %vy1.2.vec.insert 2358} 2359 2360define <2 x i16> @v2i16_hi16bits(ptr addrspace(1) %x0) { 2361; GFX9-LABEL: v2i16_hi16bits: 2362; GFX9: ; %bb.0: ; %entry 2363; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2364; GFX9-NEXT: global_load_dword v0, v[0:1], off 2365; GFX9-NEXT: s_waitcnt vmcnt(0) 2366; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2367; GFX9-NEXT: s_setpc_b64 s[30:31] 2368; 2369; GFX10-LABEL: v2i16_hi16bits: 2370; GFX10: ; %bb.0: ; %entry 2371; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2372; GFX10-NEXT: global_load_dword v0, v[0:1], off 2373; GFX10-NEXT: s_waitcnt vmcnt(0) 2374; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2375; GFX10-NEXT: s_setpc_b64 s[30:31] 2376; 2377; GFX11-LABEL: v2i16_hi16bits: 2378; GFX11: ; %bb.0: ; %entry 2379; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2380; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2381; GFX11-NEXT: s_waitcnt vmcnt(0) 2382; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2383; GFX11-NEXT: s_setpc_b64 s[30:31] 2384entry: 2385 %load0 = load <2 x i16>, ptr addrspace(1) %x0, align 4 2386 %insert1 = insertelement <2 x i16> undef, i16 0, i32 0 2387 %insert2 = insertelement <2 x i16> %insert1, i16 0, i32 1 2388 %vec.ret = shufflevector <2 x i16> %insert2, <2 x i16> %load0, <2 x i32> <i32 0, i32 3> 2389 ret <2 x i16> %vec.ret 2390} 2391 2392define <2 x half> @v2half_hi16bits(ptr addrspace(1) %x0) { 2393; GFX9-LABEL: v2half_hi16bits: 2394; GFX9: ; %bb.0: ; %entry 2395; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2396; GFX9-NEXT: global_load_dword v0, v[0:1], off 2397; GFX9-NEXT: s_waitcnt vmcnt(0) 2398; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2399; GFX9-NEXT: s_setpc_b64 s[30:31] 2400; 2401; GFX10-LABEL: v2half_hi16bits: 2402; GFX10: ; %bb.0: ; %entry 2403; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2404; GFX10-NEXT: global_load_dword v0, v[0:1], off 2405; GFX10-NEXT: s_waitcnt vmcnt(0) 
2406; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2407; GFX10-NEXT: s_setpc_b64 s[30:31] 2408; 2409; GFX11-LABEL: v2half_hi16bits: 2410; GFX11: ; %bb.0: ; %entry 2411; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2412; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2413; GFX11-NEXT: s_waitcnt vmcnt(0) 2414; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2415; GFX11-NEXT: s_setpc_b64 s[30:31] 2416entry: 2417 %load0 = load <2 x half>, ptr addrspace(1) %x0, align 4 2418 %insert1 = insertelement <2 x half> undef, half 0.0, i32 0 2419 %insert2 = insertelement <2 x half> %insert1, half 0.0, i32 1 2420 %vec.ret = shufflevector <2 x half> %insert2, <2 x half> %load0, <2 x i32> <i32 0, i32 3> 2421 ret <2 x half> %vec.ret 2422} 2423 2424define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2425; GX900-LABEL: shuffle_v8f16_concat: 2426; GX900: ; %bb.0: 2427; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2428; GX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2429; GX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2430; GX900-NEXT: s_waitcnt vmcnt(0) 2431; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 2432; GX900-NEXT: s_waitcnt vmcnt(0) 2433; GX900-NEXT: s_setpc_b64 s[30:31] 2434; 2435; GFX940-LABEL: shuffle_v8f16_concat: 2436; GFX940: ; %bb.0: 2437; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2438; GFX940-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2439; GFX940-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2440; GFX940-NEXT: s_waitcnt vmcnt(0) 2441; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 2442; GFX940-NEXT: s_waitcnt vmcnt(0) 2443; GFX940-NEXT: s_setpc_b64 s[30:31] 2444; 2445; GFX10-LABEL: shuffle_v8f16_concat: 2446; GFX10: ; %bb.0: 2447; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2448; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2449; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2450; GFX10-NEXT: s_waitcnt vmcnt(0) 2451; GFX10-NEXT: 
global_store_dwordx4 v[4:5], v[6:9], off 2452; GFX10-NEXT: s_setpc_b64 s[30:31] 2453; 2454; GFX11-LABEL: shuffle_v8f16_concat: 2455; GFX11: ; %bb.0: 2456; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2457; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 2458; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 2459; GFX11-NEXT: s_waitcnt vmcnt(0) 2460; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2461; GFX11-NEXT: s_setpc_b64 s[30:31] 2462 %val0 = load <4 x half>, ptr addrspace(1) %arg0 2463 %val1 = load <4 x half>, ptr addrspace(1) %arg1 2464 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2465 store <8 x half> %shuffle, ptr addrspace(1) %out 2466 ret void 2467} 2468 2469define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2470; GX900-LABEL: shuffle_v16f16_concat: 2471; GX900: ; %bb.0: 2472; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2473; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2474; GX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2475; GX900-NEXT: s_waitcnt vmcnt(1) 2476; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 2477; GX900-NEXT: s_waitcnt vmcnt(1) 2478; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 2479; GX900-NEXT: s_waitcnt vmcnt(0) 2480; GX900-NEXT: s_setpc_b64 s[30:31] 2481; 2482; GFX940-LABEL: shuffle_v16f16_concat: 2483; GFX940: ; %bb.0: 2484; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2485; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2486; GFX940-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2487; GFX940-NEXT: s_waitcnt vmcnt(1) 2488; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1 2489; GFX940-NEXT: s_waitcnt vmcnt(1) 2490; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1 2491; GFX940-NEXT: s_waitcnt vmcnt(0) 2492; GFX940-NEXT: s_setpc_b64 s[30:31] 2493; 2494; GFX10-LABEL: shuffle_v16f16_concat: 
2495; GFX10: ; %bb.0: 2496; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2497; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2498; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2499; GFX10-NEXT: s_waitcnt vmcnt(1) 2500; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 2501; GFX10-NEXT: s_waitcnt vmcnt(0) 2502; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 2503; GFX10-NEXT: s_setpc_b64 s[30:31] 2504; 2505; GFX11-LABEL: shuffle_v16f16_concat: 2506; GFX11: ; %bb.0: 2507; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2508; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off 2509; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off 2510; GFX11-NEXT: s_waitcnt vmcnt(1) 2511; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16 2512; GFX11-NEXT: s_waitcnt vmcnt(0) 2513; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2514; GFX11-NEXT: s_setpc_b64 s[30:31] 2515 %val0 = load <8 x half>, ptr addrspace(1) %arg0 2516 %val1 = load <8 x half>, ptr addrspace(1) %arg1 2517 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2518 store <16 x half> %shuffle, ptr addrspace(1) %out 2519 ret void 2520} 2521 2522define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2523; GX900-LABEL: shuffle_v32f16_concat: 2524; GX900: ; %bb.0: 2525; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2526; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2527; GX900-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 2528; GX900-NEXT: global_load_dwordx4 v[14:17], v[0:1], off 2529; GX900-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 2530; GX900-NEXT: s_waitcnt vmcnt(3) 2531; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 2532; GX900-NEXT: s_waitcnt vmcnt(3) 2533; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 
offset:48 2534; GX900-NEXT: s_waitcnt vmcnt(3) 2535; GX900-NEXT: global_store_dwordx4 v[4:5], v[14:17], off 2536; GX900-NEXT: s_waitcnt vmcnt(3) 2537; GX900-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 2538; GX900-NEXT: s_waitcnt vmcnt(0) 2539; GX900-NEXT: s_setpc_b64 s[30:31] 2540; 2541; GFX940-LABEL: shuffle_v32f16_concat: 2542; GFX940: ; %bb.0: 2543; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2544; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2545; GFX940-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 2546; GFX940-NEXT: global_load_dwordx4 v[14:17], v[0:1], off 2547; GFX940-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 2548; GFX940-NEXT: s_waitcnt vmcnt(3) 2549; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1 2550; GFX940-NEXT: s_waitcnt vmcnt(3) 2551; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1 2552; GFX940-NEXT: s_waitcnt vmcnt(3) 2553; GFX940-NEXT: global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1 2554; GFX940-NEXT: s_waitcnt vmcnt(3) 2555; GFX940-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1 2556; GFX940-NEXT: s_waitcnt vmcnt(0) 2557; GFX940-NEXT: s_setpc_b64 s[30:31] 2558; 2559; GFX10-LABEL: shuffle_v32f16_concat: 2560; GFX10: ; %bb.0: 2561; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2562; GFX10-NEXT: s_clause 0x1 2563; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2564; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 2565; GFX10-NEXT: s_clause 0x1 2566; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off 2567; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 2568; GFX10-NEXT: s_waitcnt vmcnt(3) 2569; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 2570; GFX10-NEXT: s_waitcnt vmcnt(2) 2571; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 2572; GFX10-NEXT: s_waitcnt vmcnt(1) 2573; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off 2574; 
GFX10-NEXT: s_waitcnt vmcnt(0) 2575; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 2576; GFX10-NEXT: s_setpc_b64 s[30:31] 2577; 2578; GFX11-LABEL: shuffle_v32f16_concat: 2579; GFX11: ; %bb.0: 2580; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2581; GFX11-NEXT: s_clause 0x1 2582; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off 2583; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 2584; GFX11-NEXT: s_clause 0x1 2585; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off 2586; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 2587; GFX11-NEXT: s_waitcnt vmcnt(3) 2588; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32 2589; GFX11-NEXT: s_waitcnt vmcnt(2) 2590; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48 2591; GFX11-NEXT: s_waitcnt vmcnt(1) 2592; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off 2593; GFX11-NEXT: s_waitcnt vmcnt(0) 2594; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16 2595; GFX11-NEXT: s_setpc_b64 s[30:31] 2596 %val0 = load <16 x half>, ptr addrspace(1) %arg0 2597 %val1 = load <16 x half>, ptr addrspace(1) %arg1 2598 %shuffle = shufflevector <16 x half> %val0, <16 x half> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2599 store <32 x half> %shuffle, ptr addrspace(1) %out 2600 ret void 2601} 2602 2603define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2604; GX900-LABEL: shuffle_v8i16_concat: 2605; GX900: ; %bb.0: 2606; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2607; GX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2608; GX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2609; GX900-NEXT: s_waitcnt vmcnt(0) 2610; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 2611; GX900-NEXT: 
s_waitcnt vmcnt(0) 2612; GX900-NEXT: s_setpc_b64 s[30:31] 2613; 2614; GFX940-LABEL: shuffle_v8i16_concat: 2615; GFX940: ; %bb.0: 2616; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2617; GFX940-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2618; GFX940-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2619; GFX940-NEXT: s_waitcnt vmcnt(0) 2620; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 2621; GFX940-NEXT: s_waitcnt vmcnt(0) 2622; GFX940-NEXT: s_setpc_b64 s[30:31] 2623; 2624; GFX10-LABEL: shuffle_v8i16_concat: 2625; GFX10: ; %bb.0: 2626; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2627; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2628; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2629; GFX10-NEXT: s_waitcnt vmcnt(0) 2630; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 2631; GFX10-NEXT: s_setpc_b64 s[30:31] 2632; 2633; GFX11-LABEL: shuffle_v8i16_concat: 2634; GFX11: ; %bb.0: 2635; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2636; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 2637; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 2638; GFX11-NEXT: s_waitcnt vmcnt(0) 2639; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2640; GFX11-NEXT: s_setpc_b64 s[30:31] 2641 %val0 = load <4 x i16>, ptr addrspace(1) %arg0 2642 %val1 = load <4 x i16>, ptr addrspace(1) %arg1 2643 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2644 store <8 x i16> %shuffle, ptr addrspace(1) %out 2645 ret void 2646} 2647 2648define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2649; GX900-LABEL: shuffle_v16i16_concat: 2650; GX900: ; %bb.0: 2651; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2652; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2653; GX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2654; GX900-NEXT: s_waitcnt vmcnt(1) 2655; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 
offset:16 2656; GX900-NEXT: s_waitcnt vmcnt(1) 2657; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 2658; GX900-NEXT: s_waitcnt vmcnt(0) 2659; GX900-NEXT: s_setpc_b64 s[30:31] 2660; 2661; GFX940-LABEL: shuffle_v16i16_concat: 2662; GFX940: ; %bb.0: 2663; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2664; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2665; GFX940-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2666; GFX940-NEXT: s_waitcnt vmcnt(1) 2667; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1 2668; GFX940-NEXT: s_waitcnt vmcnt(1) 2669; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1 2670; GFX940-NEXT: s_waitcnt vmcnt(0) 2671; GFX940-NEXT: s_setpc_b64 s[30:31] 2672; 2673; GFX10-LABEL: shuffle_v16i16_concat: 2674; GFX10: ; %bb.0: 2675; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2676; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2677; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2678; GFX10-NEXT: s_waitcnt vmcnt(1) 2679; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 2680; GFX10-NEXT: s_waitcnt vmcnt(0) 2681; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 2682; GFX10-NEXT: s_setpc_b64 s[30:31] 2683; 2684; GFX11-LABEL: shuffle_v16i16_concat: 2685; GFX11: ; %bb.0: 2686; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2687; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off 2688; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off 2689; GFX11-NEXT: s_waitcnt vmcnt(1) 2690; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16 2691; GFX11-NEXT: s_waitcnt vmcnt(0) 2692; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2693; GFX11-NEXT: s_setpc_b64 s[30:31] 2694 %val0 = load <8 x i16>, ptr addrspace(1) %arg0 2695 %val1 = load <8 x i16>, ptr addrspace(1) %arg1 2696 %shuffle = shufflevector <8 x i16> %val0, <8 x i16> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 
2697 store <16 x i16> %shuffle, ptr addrspace(1) %out 2698 ret void 2699} 2700 2701define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2702; GX900-LABEL: shuffle_v32i16_concat: 2703; GX900: ; %bb.0: 2704; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2705; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2706; GX900-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 2707; GX900-NEXT: global_load_dwordx4 v[14:17], v[0:1], off 2708; GX900-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 2709; GX900-NEXT: s_waitcnt vmcnt(3) 2710; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 2711; GX900-NEXT: s_waitcnt vmcnt(3) 2712; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 2713; GX900-NEXT: s_waitcnt vmcnt(3) 2714; GX900-NEXT: global_store_dwordx4 v[4:5], v[14:17], off 2715; GX900-NEXT: s_waitcnt vmcnt(3) 2716; GX900-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 2717; GX900-NEXT: s_waitcnt vmcnt(0) 2718; GX900-NEXT: s_setpc_b64 s[30:31] 2719; 2720; GFX940-LABEL: shuffle_v32i16_concat: 2721; GFX940: ; %bb.0: 2722; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2723; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2724; GFX940-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 2725; GFX940-NEXT: global_load_dwordx4 v[14:17], v[0:1], off 2726; GFX940-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 2727; GFX940-NEXT: s_waitcnt vmcnt(3) 2728; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1 2729; GFX940-NEXT: s_waitcnt vmcnt(3) 2730; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1 2731; GFX940-NEXT: s_waitcnt vmcnt(3) 2732; GFX940-NEXT: global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1 2733; GFX940-NEXT: s_waitcnt vmcnt(3) 2734; GFX940-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1 2735; GFX940-NEXT: s_waitcnt vmcnt(0) 2736; GFX940-NEXT: s_setpc_b64 s[30:31] 
2737; 2738; GFX10-LABEL: shuffle_v32i16_concat: 2739; GFX10: ; %bb.0: 2740; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2741; GFX10-NEXT: s_clause 0x1 2742; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2743; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 2744; GFX10-NEXT: s_clause 0x1 2745; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off 2746; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 2747; GFX10-NEXT: s_waitcnt vmcnt(3) 2748; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 2749; GFX10-NEXT: s_waitcnt vmcnt(2) 2750; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 2751; GFX10-NEXT: s_waitcnt vmcnt(1) 2752; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off 2753; GFX10-NEXT: s_waitcnt vmcnt(0) 2754; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 2755; GFX10-NEXT: s_setpc_b64 s[30:31] 2756; 2757; GFX11-LABEL: shuffle_v32i16_concat: 2758; GFX11: ; %bb.0: 2759; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2760; GFX11-NEXT: s_clause 0x1 2761; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off 2762; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 2763; GFX11-NEXT: s_clause 0x1 2764; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off 2765; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 2766; GFX11-NEXT: s_waitcnt vmcnt(3) 2767; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32 2768; GFX11-NEXT: s_waitcnt vmcnt(2) 2769; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48 2770; GFX11-NEXT: s_waitcnt vmcnt(1) 2771; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off 2772; GFX11-NEXT: s_waitcnt vmcnt(0) 2773; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16 2774; GFX11-NEXT: s_setpc_b64 s[30:31] 2775 %val0 = load <16 x i16>, ptr addrspace(1) %arg0 2776 %val1 = load <16 x i16>, ptr addrspace(1) %arg1 2777 %shuffle = shufflevector <16 x i16> %val0, <16 x i16> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, 
i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2778 store <32 x i16> %shuffle, ptr addrspace(1) %out 2779 ret void 2780} 2781 2782define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2783; GX900-LABEL: shuffle_v4i8_concat: 2784; GX900: ; %bb.0: 2785; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2786; GX900-NEXT: global_load_ushort v0, v[0:1], off 2787; GX900-NEXT: s_nop 0 2788; GX900-NEXT: global_load_short_d16_hi v0, v[2:3], off 2789; GX900-NEXT: s_waitcnt vmcnt(0) 2790; GX900-NEXT: global_store_dword v[4:5], v0, off 2791; GX900-NEXT: s_waitcnt vmcnt(0) 2792; GX900-NEXT: s_setpc_b64 s[30:31] 2793; 2794; GFX940-LABEL: shuffle_v4i8_concat: 2795; GFX940: ; %bb.0: 2796; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2797; GFX940-NEXT: global_load_ushort v6, v[0:1], off 2798; GFX940-NEXT: global_load_ushort v7, v[2:3], off 2799; GFX940-NEXT: s_mov_b32 s0, 0x5040100 2800; GFX940-NEXT: s_waitcnt vmcnt(0) 2801; GFX940-NEXT: v_perm_b32 v0, v7, v6, s0 2802; GFX940-NEXT: global_store_dword v[4:5], v0, off sc0 sc1 2803; GFX940-NEXT: s_waitcnt vmcnt(0) 2804; GFX940-NEXT: s_setpc_b64 s[30:31] 2805; 2806; GFX10-LABEL: shuffle_v4i8_concat: 2807; GFX10: ; %bb.0: 2808; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2809; GFX10-NEXT: global_load_ushort v0, v[0:1], off 2810; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off 2811; GFX10-NEXT: s_waitcnt vmcnt(0) 2812; GFX10-NEXT: global_store_dword v[4:5], v0, off 2813; GFX10-NEXT: s_setpc_b64 s[30:31] 2814; 2815; GFX11-LABEL: shuffle_v4i8_concat: 2816; GFX11: ; %bb.0: 2817; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2818; GFX11-NEXT: global_load_u16 v0, v[0:1], off 2819; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off 2820; GFX11-NEXT: s_waitcnt vmcnt(0) 2821; GFX11-NEXT: 
global_store_b32 v[4:5], v0, off 2822; GFX11-NEXT: s_setpc_b64 s[30:31] 2823 %val0 = load <2 x i8>, ptr addrspace(1) %arg0 2824 %val1 = load <2 x i8>, ptr addrspace(1) %arg1 2825 %shuffle = shufflevector <2 x i8> %val0, <2 x i8> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2826 store <4 x i8> %shuffle, ptr addrspace(1) %out 2827 ret void 2828} 2829 2830define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2831; GX900-LABEL: shuffle_v8i8_concat: 2832; GX900: ; %bb.0: 2833; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2834; GX900-NEXT: global_load_dword v6, v[0:1], off 2835; GX900-NEXT: global_load_dword v7, v[2:3], off 2836; GX900-NEXT: s_waitcnt vmcnt(0) 2837; GX900-NEXT: global_store_dwordx2 v[4:5], v[6:7], off 2838; GX900-NEXT: s_waitcnt vmcnt(0) 2839; GX900-NEXT: s_setpc_b64 s[30:31] 2840; 2841; GFX940-LABEL: shuffle_v8i8_concat: 2842; GFX940: ; %bb.0: 2843; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2844; GFX940-NEXT: global_load_dword v6, v[0:1], off 2845; GFX940-NEXT: global_load_dword v7, v[2:3], off 2846; GFX940-NEXT: s_waitcnt vmcnt(0) 2847; GFX940-NEXT: global_store_dwordx2 v[4:5], v[6:7], off sc0 sc1 2848; GFX940-NEXT: s_waitcnt vmcnt(0) 2849; GFX940-NEXT: s_setpc_b64 s[30:31] 2850; 2851; GFX10-LABEL: shuffle_v8i8_concat: 2852; GFX10: ; %bb.0: 2853; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2854; GFX10-NEXT: global_load_dword v6, v[0:1], off 2855; GFX10-NEXT: global_load_dword v7, v[2:3], off 2856; GFX10-NEXT: s_waitcnt vmcnt(0) 2857; GFX10-NEXT: global_store_dwordx2 v[4:5], v[6:7], off 2858; GFX10-NEXT: s_setpc_b64 s[30:31] 2859; 2860; GFX11-LABEL: shuffle_v8i8_concat: 2861; GFX11: ; %bb.0: 2862; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2863; GFX11-NEXT: global_load_b32 v0, v[0:1], off 2864; GFX11-NEXT: global_load_b32 v1, v[2:3], off 2865; GFX11-NEXT: s_waitcnt vmcnt(0) 2866; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off 2867; GFX11-NEXT: s_setpc_b64 s[30:31] 
2868 %val0 = load <4 x i8>, ptr addrspace(1) %arg0 2869 %val1 = load <4 x i8>, ptr addrspace(1) %arg1 2870 %shuffle = shufflevector <4 x i8> %val0, <4 x i8> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2871 store <8 x i8> %shuffle, ptr addrspace(1) %out 2872 ret void 2873} 2874 2875define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2876; GX900-LABEL: shuffle_v16i8_concat: 2877; GX900: ; %bb.0: 2878; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2879; GX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2880; GX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2881; GX900-NEXT: s_waitcnt vmcnt(0) 2882; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 2883; GX900-NEXT: s_waitcnt vmcnt(0) 2884; GX900-NEXT: s_setpc_b64 s[30:31] 2885; 2886; GFX940-LABEL: shuffle_v16i8_concat: 2887; GFX940: ; %bb.0: 2888; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2889; GFX940-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2890; GFX940-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2891; GFX940-NEXT: s_waitcnt vmcnt(0) 2892; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 2893; GFX940-NEXT: s_waitcnt vmcnt(0) 2894; GFX940-NEXT: s_setpc_b64 s[30:31] 2895; 2896; GFX10-LABEL: shuffle_v16i8_concat: 2897; GFX10: ; %bb.0: 2898; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2899; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2900; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2901; GFX10-NEXT: s_waitcnt vmcnt(0) 2902; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 2903; GFX10-NEXT: s_setpc_b64 s[30:31] 2904; 2905; GFX11-LABEL: shuffle_v16i8_concat: 2906; GFX11: ; %bb.0: 2907; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2908; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 2909; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 2910; GFX11-NEXT: s_waitcnt vmcnt(0) 2911; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2912; GFX11-NEXT: s_setpc_b64 
s[30:31] 2913 %val0 = load <8 x i8>, ptr addrspace(1) %arg0 2914 %val1 = load <8 x i8>, ptr addrspace(1) %arg1 2915 %shuffle = shufflevector <8 x i8> %val0, <8 x i8> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2916 store <16 x i8> %shuffle, ptr addrspace(1) %out 2917 ret void 2918} 2919 2920define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2921; GX900-LABEL: shuffle_v32i8_concat: 2922; GX900: ; %bb.0: 2923; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2924; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2925; GX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2926; GX900-NEXT: s_waitcnt vmcnt(1) 2927; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 2928; GX900-NEXT: s_waitcnt vmcnt(1) 2929; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 2930; GX900-NEXT: s_waitcnt vmcnt(0) 2931; GX900-NEXT: s_setpc_b64 s[30:31] 2932; 2933; GFX940-LABEL: shuffle_v32i8_concat: 2934; GFX940: ; %bb.0: 2935; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2936; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2937; GFX940-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2938; GFX940-NEXT: s_waitcnt vmcnt(1) 2939; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1 2940; GFX940-NEXT: s_waitcnt vmcnt(1) 2941; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1 2942; GFX940-NEXT: s_waitcnt vmcnt(0) 2943; GFX940-NEXT: s_setpc_b64 s[30:31] 2944; 2945; GFX10-LABEL: shuffle_v32i8_concat: 2946; GFX10: ; %bb.0: 2947; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2948; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 2949; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 2950; GFX10-NEXT: s_waitcnt vmcnt(1) 2951; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 2952; GFX10-NEXT: s_waitcnt vmcnt(0) 2953; GFX10-NEXT: global_store_dwordx4 v[4:5], 
v[10:13], off 2954; GFX10-NEXT: s_setpc_b64 s[30:31] 2955; 2956; GFX11-LABEL: shuffle_v32i8_concat: 2957; GFX11: ; %bb.0: 2958; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2959; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off 2960; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off 2961; GFX11-NEXT: s_waitcnt vmcnt(1) 2962; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16 2963; GFX11-NEXT: s_waitcnt vmcnt(0) 2964; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 2965; GFX11-NEXT: s_setpc_b64 s[30:31] 2966 %val0 = load <16 x i8>, ptr addrspace(1) %arg0 2967 %val1 = load <16 x i8>, ptr addrspace(1) %arg1 2968 %shuffle = shufflevector <16 x i8> %val0, <16 x i8> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2969 store <32 x i8> %shuffle, ptr addrspace(1) %out 2970 ret void 2971} 2972 2973define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 2974; GX900-LABEL: shuffle_v4i32_concat: 2975; GX900: ; %bb.0: 2976; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2977; GX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2978; GX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2979; GX900-NEXT: s_waitcnt vmcnt(0) 2980; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 2981; GX900-NEXT: s_waitcnt vmcnt(0) 2982; GX900-NEXT: s_setpc_b64 s[30:31] 2983; 2984; GFX940-LABEL: shuffle_v4i32_concat: 2985; GFX940: ; %bb.0: 2986; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2987; GFX940-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2988; GFX940-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2989; GFX940-NEXT: s_waitcnt vmcnt(0) 2990; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 2991; GFX940-NEXT: s_waitcnt vmcnt(0) 2992; GFX940-NEXT: s_setpc_b64 s[30:31] 2993; 2994; 
GFX10-LABEL: shuffle_v4i32_concat: 2995; GFX10: ; %bb.0: 2996; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2997; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off 2998; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off 2999; GFX10-NEXT: s_waitcnt vmcnt(0) 3000; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 3001; GFX10-NEXT: s_setpc_b64 s[30:31] 3002; 3003; GFX11-LABEL: shuffle_v4i32_concat: 3004; GFX11: ; %bb.0: 3005; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3006; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 3007; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 3008; GFX11-NEXT: s_waitcnt vmcnt(0) 3009; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 3010; GFX11-NEXT: s_setpc_b64 s[30:31] 3011 %val0 = load <2 x i32>, ptr addrspace(1) %arg0 3012 %val1 = load <2 x i32>, ptr addrspace(1) %arg1 3013 %shuffle = shufflevector <2 x i32> %val0, <2 x i32> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3014 store <4 x i32> %shuffle, ptr addrspace(1) %out 3015 ret void 3016} 3017 3018define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { 3019; GX900-LABEL: shuffle_v8i32_concat: 3020; GX900: ; %bb.0: 3021; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3022; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 3023; GX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 3024; GX900-NEXT: s_waitcnt vmcnt(1) 3025; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 3026; GX900-NEXT: s_waitcnt vmcnt(1) 3027; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off 3028; GX900-NEXT: s_waitcnt vmcnt(0) 3029; GX900-NEXT: s_setpc_b64 s[30:31] 3030; 3031; GFX940-LABEL: shuffle_v8i32_concat: 3032; GFX940: ; %bb.0: 3033; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3034; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off 3035; GFX940-NEXT: global_load_dwordx4 v[10:13], v[0:1], off 3036; GFX940-NEXT: s_waitcnt vmcnt(1) 3037; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off 
offset:16 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(1)
; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8i32_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8i32_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x i32>, ptr addrspace(1) %arg0
  %val1 = load <4 x i32>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x i32> %val0, <4 x i32> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %shuffle, ptr addrspace(1) %out
  ret void
}

; Concatenate the <8 x i32> vectors loaded from %arg0 and %arg1 (mask 0..15)
; and store the <16 x i32> result to %out.
define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
; GX900-LABEL: shuffle_v16i32_concat:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GX900-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GX900-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GX900-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GX900-NEXT: s_waitcnt vmcnt(3)
; GX900-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
; GX900-NEXT: s_waitcnt vmcnt(3)
; GX900-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
; GX900-NEXT: s_waitcnt vmcnt(3)
; GX900-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
; GX900-NEXT: s_waitcnt vmcnt(3)
; GX900-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v16i32_concat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX940-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GFX940-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GFX940-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX940-NEXT: s_waitcnt vmcnt(3)
; GFX940-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(3)
; GFX940-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(3)
; GFX940-NEXT: global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(3)
; GFX940-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v16i32_concat:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v16i32_concat:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <8 x i32>, ptr addrspace(1) %arg0
  %rhs = load <8 x i32>, ptr addrspace(1) %arg1
  %shuf = shufflevector <8 x i32> %lhs, <8 x i32> %rhs, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i32> %shuf, ptr addrspace(1) %out
  ret void
}

; Lanes 0-1 = elements 2-3 of the %arg0 vector; lanes 2-3 are don't-care.
define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_23uu:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_23uu:
; GFX10: ; %bb.0:
; GFX10-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_23uu:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  ret <4 x bfloat> %shuffle
}

; Lanes 0-1 = elements 2-3 of the %arg0 vector, lane 2 = element 0 of the
; %arg1 vector (mask index 4); lane 3 is don't-care.
define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_234u:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: s_waitcnt vmcnt(1)
; GX900-NEXT: v_mov_b32_e32 v0, v6
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_mov_b32_e32 v1, v4
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_234u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(1)
; GFX940-NEXT: v_mov_b32_e32 v0, v4
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_234u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_234u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 poison>
  ret <4 x bfloat> %shuffle
}

; Lanes 1 and 3 = elements 1 and 3 of the %arg0 vector; lanes 0 and 2 are
; don't-care.
define <4 x bfloat> @shuffle_v4bf16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_u1u3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_u1u3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_u1u3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 poison, i32 1, i32 poison, i32 3>
  ret <4 x bfloat> %shuffle
}

; Lane 1 = element 3 and lane 3 = element 1 of the %arg0 vector; lanes 0 and 2
; are don't-care.
define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_u3u1:
; GX900: ; %bb.0:
; GX900-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_mov_b32_e32 v0, v2
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_u3u1:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_u3u1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_u3u1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 poison, i32 3, i32 poison, i32 1>
  ret <4 x bfloat> %shuffle
}

; Lane 1 = element 3 of the %arg0 vector; every other lane is don't-care.
define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_u3uu:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_u3uu:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
;
GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_u3uu:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
  ret <4 x bfloat> %shuffle
}

; Lane 0 = element 3 of the %arg0 vector, lane 2 = element 2 of the %arg1
; vector (mask index 6); lanes 1 and 3 are don't-care.
define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_3u6u:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dword v5, v[0:1], off offset:4
; GX900-NEXT: global_load_dword v4, v[2:3], off offset:4
; GX900-NEXT: s_waitcnt vmcnt(1)
; GX900-NEXT: v_alignbit_b32 v0, s4, v5, 16
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_mov_b32_e32 v1, v4
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_3u6u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX940-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX940-NEXT: s_waitcnt vmcnt(1)
; GFX940-NEXT: v_alignbit_b32 v0, s0, v5, 16
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_3u6u:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_alignbit_b32 v0, s4, v5, 16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT:
s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_3u6u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 poison, i32 6, i32 poison>
  ret <4 x bfloat> %shuffle
}

; Lane 0 = element 3 of the %arg0 vector, lane 3 = element 3 of the %arg1
; vector (mask index 7); lanes 1-2 are don't-care.
define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_3uu7:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dword v5, v[0:1], off offset:4
; GX900-NEXT: global_load_dword v4, v[2:3], off offset:4
; GX900-NEXT: s_waitcnt vmcnt(1)
; GX900-NEXT: v_alignbit_b32 v0, s4, v5, 16
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_mov_b32_e32 v1, v4
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_3uu7:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX940-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX940-NEXT: s_waitcnt vmcnt(1)
; GFX940-NEXT: v_alignbit_b32 v0, s0, v5, 16
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_3uu7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT:
v_alignbit_b32 v0, s4, v5, 16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_3uu7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 poison, i32 poison, i32 7>
  ret <4 x bfloat> %shuffle
}

; Lane 0 = element 3 of the %arg0 vector, lanes 1 and 3 = element 1 of the
; %arg1 vector (mask index 5); lane 2 is don't-care.
define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_35u5:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dword v5, v[0:1], off offset:4
; GX900-NEXT: global_load_dword v4, v[2:3], off
; GX900-NEXT: s_mov_b32 s4, 0x7060302
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_perm_b32 v0, v4, v5, s4
; GX900-NEXT: v_mov_b32_e32 v1, v4
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_35u5:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX940-NEXT: global_load_dword v4, v[2:3], off
; GFX940-NEXT: s_mov_b32 s0, 0x7060302
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_perm_b32 v0, v4, v5, s0
; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_35u5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
;
GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_35u5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 poison, i32 5>
  ret <4 x bfloat> %shuffle
}

; Lane 0 = element 3 of the %arg0 vector, lanes 1-2 = elements 1 and 3 of the
; %arg1 vector (mask indices 5 and 7); lane 3 is don't-care.
define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_357u:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
; GX900-NEXT: s_mov_b32 s4, 0x7060302
; GX900-NEXT: s_waitcnt vmcnt(1)
; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_357u:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX940-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX940-NEXT: s_mov_b32 s0, 0x7060302
; GFX940-NEXT: s_waitcnt vmcnt(1)
; GFX940-NEXT: v_alignbit_b32 v1, s0, v5, 16
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_perm_b32 v0, v4, v6, s0
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_357u:
; GFX10: ; %bb.0:
;
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_357u:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  ; Don't-care lanes are written as poison; undef is deprecated in LLVM IR.
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 poison>
  ret <4 x bfloat> %shuffle
}

; Low half (elements 0-1) of the %arg0 vector, repeated in both halves of the
; result.
define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_0101:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_0101:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_0101:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT:
v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x bfloat> %shuf
}

; Identity shuffle of the %arg0 vector (mask indices 0-3).
define <4 x bfloat> @shuffle_v4bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_0123:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_0123:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_0123:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x bfloat> %shuf
}

; Low half of the %arg0 vector followed by the low half of the %arg1 vector.
define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_0145:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v5, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_0145:
; GFX10:
; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_0145:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x bfloat> %shuf
}

; Low half of the %arg0 vector followed by the high half of the %arg1 vector.
define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_0167:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_0167:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_0167:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;
GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x bfloat> %shuf
}

; The two halves of the %arg0 vector, swapped.
define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GX900-LABEL: shuffle_v4bf16_2301:
; GX900: ; %bb.0:
; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GX900-NEXT: s_waitcnt vmcnt(0)
; GX900-NEXT: v_mov_b32_e32 v0, v2
; GX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v4bf16_2301:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: v_mov_b32_e32 v1, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_2301:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_2301:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x bfloat> %shuf
}

; High half (elements 2-3) of the %arg0 vector, repeated in both halves of the
; result.
define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_2323:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_2323:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_2323:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x bfloat> %shuf
}

; High half of the %arg0 vector followed by the low half of the %arg1 vector.
define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_2345:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v5, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_2345:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT:
global_load_dword v5, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_2345:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x bfloat> %shuf
}

; High half of the %arg0 vector followed by the high half of the %arg1 vector.
define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_2367:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_2367:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_2367:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x bfloat> %shuf
}

; Low half of the %arg1 vector followed by the low half of the %arg0 vector.
define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_4501:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[2:3], off
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_4501:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_4501:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: global_load_b32 v1, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x bfloat> %shuf
}

; Low half of the %arg1 vector followed by the high half of the %arg0 vector.
define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
;
GFX9-LABEL: shuffle_v4bf16_4523:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[2:3], off
; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_4523:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_4523:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x bfloat> %shuf
}

; Low half of the %arg1 vector (mask indices 4-5), repeated in both halves of
; the result.
define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_4545:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_4545:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;
GFX10-NEXT: global_load_dword v0, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_4545:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
  ret <4 x bfloat> %shuf
}

; Selects the whole %arg1 vector (mask indices 4-7).
define <4 x bfloat> @shuffle_v4bf16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_4567:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_4567:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_4567:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %lhs = load <4 x bfloat>, ptr addrspace(1) %arg0
  %rhs = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuf = shufflevector <4 x bfloat> %lhs, <4 x bfloat> %rhs, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x bfloat> %shuf
}

; High half of the %arg1 vector followed by the low half of the %arg0 vector.
define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; GFX9-LABEL: shuffle_v4bf16_6701:
; GFX9: ; %bb.0:
;
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3923; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 3924; GFX9-NEXT: global_load_dword v5, v[0:1], off 3925; GFX9-NEXT: s_waitcnt vmcnt(1) 3926; GFX9-NEXT: v_mov_b32_e32 v0, v4 3927; GFX9-NEXT: s_waitcnt vmcnt(0) 3928; GFX9-NEXT: v_mov_b32_e32 v1, v5 3929; GFX9-NEXT: s_setpc_b64 s[30:31] 3930; 3931; GFX10-LABEL: shuffle_v4bf16_6701: 3932; GFX10: ; %bb.0: 3933; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3934; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 3935; GFX10-NEXT: global_load_dword v5, v[0:1], off 3936; GFX10-NEXT: s_waitcnt vmcnt(1) 3937; GFX10-NEXT: v_mov_b32_e32 v0, v4 3938; GFX10-NEXT: s_waitcnt vmcnt(0) 3939; GFX10-NEXT: v_mov_b32_e32 v1, v5 3940; GFX10-NEXT: s_setpc_b64 s[30:31] 3941; 3942; GFX11-LABEL: shuffle_v4bf16_6701: 3943; GFX11: ; %bb.0: 3944; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3945; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 3946; GFX11-NEXT: global_load_b32 v1, v[0:1], off 3947; GFX11-NEXT: s_waitcnt vmcnt(1) 3948; GFX11-NEXT: v_mov_b32_e32 v0, v2 3949; GFX11-NEXT: s_waitcnt vmcnt(0) 3950; GFX11-NEXT: s_setpc_b64 s[30:31] 3951 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 3952 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 3953 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 3954 ret <4 x bfloat> %shuffle 3955} 3956 3957define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 3958; GFX9-LABEL: shuffle_v4bf16_6723: 3959; GFX9: ; %bb.0: 3960; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3961; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 3962; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 3963; GFX9-NEXT: s_waitcnt vmcnt(1) 3964; GFX9-NEXT: v_mov_b32_e32 v0, v4 3965; GFX9-NEXT: s_waitcnt vmcnt(0) 3966; GFX9-NEXT: v_mov_b32_e32 v1, v5 3967; GFX9-NEXT: s_setpc_b64 s[30:31] 3968; 3969; GFX10-LABEL: shuffle_v4bf16_6723: 3970; 
GFX10: ; %bb.0: 3971; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3972; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 3973; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 3974; GFX10-NEXT: s_waitcnt vmcnt(1) 3975; GFX10-NEXT: v_mov_b32_e32 v0, v4 3976; GFX10-NEXT: s_waitcnt vmcnt(0) 3977; GFX10-NEXT: v_mov_b32_e32 v1, v5 3978; GFX10-NEXT: s_setpc_b64 s[30:31] 3979; 3980; GFX11-LABEL: shuffle_v4bf16_6723: 3981; GFX11: ; %bb.0: 3982; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3983; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 3984; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 3985; GFX11-NEXT: s_waitcnt vmcnt(1) 3986; GFX11-NEXT: v_mov_b32_e32 v0, v2 3987; GFX11-NEXT: s_waitcnt vmcnt(0) 3988; GFX11-NEXT: s_setpc_b64 s[30:31] 3989 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 3990 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 3991 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 3992 ret <4 x bfloat> %shuffle 3993} 3994 3995define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 3996; GX900-LABEL: shuffle_v4bf16_6745: 3997; GX900: ; %bb.0: 3998; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3999; GX900-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 4000; GX900-NEXT: s_waitcnt vmcnt(0) 4001; GX900-NEXT: v_mov_b32_e32 v0, v2 4002; GX900-NEXT: s_setpc_b64 s[30:31] 4003; 4004; GFX940-LABEL: shuffle_v4bf16_6745: 4005; GFX940: ; %bb.0: 4006; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4007; GFX940-NEXT: global_load_dwordx2 v[2:3], v[2:3], off 4008; GFX940-NEXT: s_waitcnt vmcnt(0) 4009; GFX940-NEXT: v_mov_b32_e32 v0, v3 4010; GFX940-NEXT: v_mov_b32_e32 v1, v2 4011; GFX940-NEXT: s_setpc_b64 s[30:31] 4012; 4013; GFX10-LABEL: shuffle_v4bf16_6745: 4014; GFX10: ; %bb.0: 4015; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4016; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off 4017; GFX10-NEXT: s_waitcnt vmcnt(0) 
4018; GFX10-NEXT: v_mov_b32_e32 v0, v2 4019; GFX10-NEXT: s_setpc_b64 s[30:31] 4020; 4021; GFX11-LABEL: shuffle_v4bf16_6745: 4022; GFX11: ; %bb.0: 4023; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4024; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off 4025; GFX11-NEXT: s_waitcnt vmcnt(0) 4026; GFX11-NEXT: v_mov_b32_e32 v0, v2 4027; GFX11-NEXT: s_setpc_b64 s[30:31] 4028 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4029 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4030 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5> 4031 ret <4 x bfloat> %shuffle 4032} 4033 4034define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4035; GFX9-LABEL: shuffle_v4bf16_6767: 4036; GFX9: ; %bb.0: 4037; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4038; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 4039; GFX9-NEXT: s_waitcnt vmcnt(0) 4040; GFX9-NEXT: v_mov_b32_e32 v1, v0 4041; GFX9-NEXT: s_setpc_b64 s[30:31] 4042; 4043; GFX10-LABEL: shuffle_v4bf16_6767: 4044; GFX10: ; %bb.0: 4045; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4046; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 4047; GFX10-NEXT: s_waitcnt vmcnt(0) 4048; GFX10-NEXT: v_mov_b32_e32 v1, v0 4049; GFX10-NEXT: s_setpc_b64 s[30:31] 4050; 4051; GFX11-LABEL: shuffle_v4bf16_6767: 4052; GFX11: ; %bb.0: 4053; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4054; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4 4055; GFX11-NEXT: s_waitcnt vmcnt(0) 4056; GFX11-NEXT: v_mov_b32_e32 v1, v0 4057; GFX11-NEXT: s_setpc_b64 s[30:31] 4058 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4059 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4060 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7> 4061 ret <4 x bfloat> %shuffle 4062} 4063 4064define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4065; GX900-LABEL: 
shuffle_v4bf16_2356: 4066; GX900: ; %bb.0: 4067; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4068; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 4069; GX900-NEXT: global_load_dword v4, v[0:1], off offset:4 4070; GX900-NEXT: s_waitcnt vmcnt(1) 4071; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16 4072; GX900-NEXT: s_waitcnt vmcnt(0) 4073; GX900-NEXT: v_mov_b32_e32 v0, v4 4074; GX900-NEXT: s_setpc_b64 s[30:31] 4075; 4076; GFX940-LABEL: shuffle_v4bf16_2356: 4077; GFX940: ; %bb.0: 4078; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4079; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off 4080; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4 4081; GFX940-NEXT: s_waitcnt vmcnt(1) 4082; GFX940-NEXT: v_alignbit_b32 v1, v7, v6, 16 4083; GFX940-NEXT: s_waitcnt vmcnt(0) 4084; GFX940-NEXT: v_mov_b32_e32 v0, v4 4085; GFX940-NEXT: s_setpc_b64 s[30:31] 4086; 4087; GFX10-LABEL: shuffle_v4bf16_2356: 4088; GFX10: ; %bb.0: 4089; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4090; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 4091; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 4092; GFX10-NEXT: s_waitcnt vmcnt(1) 4093; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 4094; GFX10-NEXT: s_waitcnt vmcnt(0) 4095; GFX10-NEXT: v_mov_b32_e32 v0, v4 4096; GFX10-NEXT: s_setpc_b64 s[30:31] 4097; 4098; GFX11-LABEL: shuffle_v4bf16_2356: 4099; GFX11: ; %bb.0: 4100; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4101; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 4102; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 4103; GFX11-NEXT: s_waitcnt vmcnt(1) 4104; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 4105; GFX11-NEXT: s_waitcnt vmcnt(0) 4106; GFX11-NEXT: s_setpc_b64 s[30:31] 4107 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4108 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4109 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6> 4110 ret <4 x bfloat> %shuffle 4111} 4112 4113define <4 x 
bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4114; GX900-LABEL: shuffle_v4bf16_5623: 4115; GX900: ; %bb.0: 4116; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4117; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 4118; GX900-NEXT: global_load_dword v4, v[0:1], off offset:4 4119; GX900-NEXT: s_waitcnt vmcnt(1) 4120; GX900-NEXT: v_alignbit_b32 v0, v6, v5, 16 4121; GX900-NEXT: s_waitcnt vmcnt(0) 4122; GX900-NEXT: v_mov_b32_e32 v1, v4 4123; GX900-NEXT: s_setpc_b64 s[30:31] 4124; 4125; GFX940-LABEL: shuffle_v4bf16_5623: 4126; GFX940: ; %bb.0: 4127; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4128; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off 4129; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4 4130; GFX940-NEXT: s_waitcnt vmcnt(1) 4131; GFX940-NEXT: v_alignbit_b32 v0, v7, v6, 16 4132; GFX940-NEXT: s_waitcnt vmcnt(0) 4133; GFX940-NEXT: v_mov_b32_e32 v1, v4 4134; GFX940-NEXT: s_setpc_b64 s[30:31] 4135; 4136; GFX10-LABEL: shuffle_v4bf16_5623: 4137; GFX10: ; %bb.0: 4138; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4139; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 4140; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 4141; GFX10-NEXT: s_waitcnt vmcnt(1) 4142; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 4143; GFX10-NEXT: s_waitcnt vmcnt(0) 4144; GFX10-NEXT: v_mov_b32_e32 v1, v4 4145; GFX10-NEXT: s_setpc_b64 s[30:31] 4146; 4147; GFX11-LABEL: shuffle_v4bf16_5623: 4148; GFX11: ; %bb.0: 4149; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4150; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 4151; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 4152; GFX11-NEXT: s_waitcnt vmcnt(1) 4153; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 4154; GFX11-NEXT: s_waitcnt vmcnt(0) 4155; GFX11-NEXT: s_setpc_b64 s[30:31] 4156 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4157 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4158 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, 
<4 x i32> <i32 5, i32 6, i32 2, i32 3> 4159 ret <4 x bfloat> %shuffle 4160} 4161 4162define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4163; GFX9-LABEL: shuffle_v4bf16_3456: 4164; GFX9: ; %bb.0: 4165; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4166; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4167; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 4168; GFX9-NEXT: s_waitcnt vmcnt(1) 4169; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 4170; GFX9-NEXT: s_waitcnt vmcnt(0) 4171; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16 4172; GFX9-NEXT: s_setpc_b64 s[30:31] 4173; 4174; GFX10-LABEL: shuffle_v4bf16_3456: 4175; GFX10: ; %bb.0: 4176; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4177; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4178; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 4179; GFX10-NEXT: s_waitcnt vmcnt(1) 4180; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 4181; GFX10-NEXT: s_waitcnt vmcnt(0) 4182; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 4183; GFX10-NEXT: s_setpc_b64 s[30:31] 4184; 4185; GFX11-LABEL: shuffle_v4bf16_3456: 4186; GFX11: ; %bb.0: 4187; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4188; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 4189; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 4190; GFX11-NEXT: s_waitcnt vmcnt(1) 4191; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 4192; GFX11-NEXT: s_waitcnt vmcnt(0) 4193; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16 4194; GFX11-NEXT: s_setpc_b64 s[30:31] 4195 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4196 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4197 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 4198 ret <4 x bfloat> %shuffle 4199} 4200 4201define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4202; GFX9-LABEL: shuffle_v4bf16_5634: 4203; GFX9: ; %bb.0: 4204; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4205; 
GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4206; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 4207; GFX9-NEXT: s_waitcnt vmcnt(1) 4208; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 4209; GFX9-NEXT: s_waitcnt vmcnt(0) 4210; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 4211; GFX9-NEXT: s_setpc_b64 s[30:31] 4212; 4213; GFX10-LABEL: shuffle_v4bf16_5634: 4214; GFX10: ; %bb.0: 4215; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4216; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4217; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 4218; GFX10-NEXT: s_waitcnt vmcnt(1) 4219; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 4220; GFX10-NEXT: s_waitcnt vmcnt(0) 4221; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 4222; GFX10-NEXT: s_setpc_b64 s[30:31] 4223; 4224; GFX11-LABEL: shuffle_v4bf16_5634: 4225; GFX11: ; %bb.0: 4226; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4227; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 4228; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 4229; GFX11-NEXT: s_waitcnt vmcnt(1) 4230; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 4231; GFX11-NEXT: s_waitcnt vmcnt(0) 4232; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 4233; GFX11-NEXT: s_setpc_b64 s[30:31] 4234 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4235 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4236 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4> 4237 ret <4 x bfloat> %shuffle 4238} 4239 4240define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4241; GX900-LABEL: shuffle_v4bf16_5734: 4242; GX900: ; %bb.0: 4243; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4244; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4245; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4 4246; GX900-NEXT: s_mov_b32 s4, 0x7060302 4247; GX900-NEXT: s_waitcnt vmcnt(1) 4248; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 4249; GX900-NEXT: s_waitcnt vmcnt(0) 4250; GX900-NEXT: 
v_alignbit_b32 v1, v4, v6, 16 4251; GX900-NEXT: s_setpc_b64 s[30:31] 4252; 4253; GFX940-LABEL: shuffle_v4bf16_5734: 4254; GFX940: ; %bb.0: 4255; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4256; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4257; GFX940-NEXT: global_load_dword v6, v[0:1], off offset:4 4258; GFX940-NEXT: s_mov_b32 s0, 0x7060302 4259; GFX940-NEXT: s_waitcnt vmcnt(1) 4260; GFX940-NEXT: v_perm_b32 v0, v5, v4, s0 4261; GFX940-NEXT: s_waitcnt vmcnt(0) 4262; GFX940-NEXT: v_alignbit_b32 v1, v4, v6, 16 4263; GFX940-NEXT: s_setpc_b64 s[30:31] 4264; 4265; GFX10-LABEL: shuffle_v4bf16_5734: 4266; GFX10: ; %bb.0: 4267; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4268; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off 4269; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 4270; GFX10-NEXT: s_waitcnt vmcnt(1) 4271; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 4272; GFX10-NEXT: s_waitcnt vmcnt(0) 4273; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 4274; GFX10-NEXT: s_setpc_b64 s[30:31] 4275; 4276; GFX11-LABEL: shuffle_v4bf16_5734: 4277; GFX11: ; %bb.0: 4278; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4279; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off 4280; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 4281; GFX11-NEXT: s_waitcnt vmcnt(1) 4282; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 4283; GFX11-NEXT: s_waitcnt vmcnt(0) 4284; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 4285; GFX11-NEXT: s_setpc_b64 s[30:31] 4286 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4287 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4288 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4> 4289 ret <4 x bfloat> %shuffle 4290} 4291 4292define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4293; GX900-LABEL: shuffle_v4bf16_0000: 4294; GX900: ; %bb.0: 4295; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4296; GX900-NEXT: global_load_dwordx2 
v[0:1], v[0:1], off 4297; GX900-NEXT: s_mov_b32 s4, 0x5040100 4298; GX900-NEXT: s_waitcnt vmcnt(0) 4299; GX900-NEXT: v_perm_b32 v0, v0, v0, s4 4300; GX900-NEXT: v_mov_b32_e32 v1, v0 4301; GX900-NEXT: s_setpc_b64 s[30:31] 4302; 4303; GFX940-LABEL: shuffle_v4bf16_0000: 4304; GFX940: ; %bb.0: 4305; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4306; GFX940-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4307; GFX940-NEXT: s_mov_b32 s0, 0x5040100 4308; GFX940-NEXT: s_waitcnt vmcnt(0) 4309; GFX940-NEXT: v_perm_b32 v0, v0, v0, s0 4310; GFX940-NEXT: v_mov_b32_e32 v1, v0 4311; GFX940-NEXT: s_setpc_b64 s[30:31] 4312; 4313; GFX10-LABEL: shuffle_v4bf16_0000: 4314; GFX10: ; %bb.0: 4315; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4316; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4317; GFX10-NEXT: s_waitcnt vmcnt(0) 4318; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 4319; GFX10-NEXT: v_mov_b32_e32 v1, v0 4320; GFX10-NEXT: s_setpc_b64 s[30:31] 4321; 4322; GFX11-LABEL: shuffle_v4bf16_0000: 4323; GFX11: ; %bb.0: 4324; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4325; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 4326; GFX11-NEXT: s_waitcnt vmcnt(0) 4327; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 4328; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4329; GFX11-NEXT: v_mov_b32_e32 v1, v0 4330; GFX11-NEXT: s_setpc_b64 s[30:31] 4331 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4332 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4333 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer 4334 ret <4 x bfloat> %shuffle 4335} 4336 4337define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4338; GFX9-LABEL: shuffle_v4bf16_1010: 4339; GFX9: ; %bb.0: 4340; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4341; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4342; GFX9-NEXT: s_waitcnt vmcnt(0) 4343; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 4344; GFX9-NEXT: v_mov_b32_e32 v1, v0 
4345; GFX9-NEXT: s_setpc_b64 s[30:31] 4346; 4347; GFX10-LABEL: shuffle_v4bf16_1010: 4348; GFX10: ; %bb.0: 4349; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4350; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4351; GFX10-NEXT: s_waitcnt vmcnt(0) 4352; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 4353; GFX10-NEXT: v_mov_b32_e32 v1, v0 4354; GFX10-NEXT: s_setpc_b64 s[30:31] 4355; 4356; GFX11-LABEL: shuffle_v4bf16_1010: 4357; GFX11: ; %bb.0: 4358; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4359; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 4360; GFX11-NEXT: s_waitcnt vmcnt(0) 4361; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16 4362; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4363; GFX11-NEXT: v_mov_b32_e32 v1, v0 4364; GFX11-NEXT: s_setpc_b64 s[30:31] 4365 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4366 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4367 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0> 4368 ret <4 x bfloat> %shuffle 4369} 4370 4371define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4372; GX900-LABEL: shuffle_v4bf16_1100: 4373; GX900: ; %bb.0: 4374; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4375; GX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 4376; GX900-NEXT: s_mov_b32 s4, 0x7060302 4377; GX900-NEXT: s_mov_b32 s5, 0x5040100 4378; GX900-NEXT: s_waitcnt vmcnt(0) 4379; GX900-NEXT: v_perm_b32 v0, v1, v1, s4 4380; GX900-NEXT: v_perm_b32 v1, v1, v1, s5 4381; GX900-NEXT: s_setpc_b64 s[30:31] 4382; 4383; GFX940-LABEL: shuffle_v4bf16_1100: 4384; GFX940: ; %bb.0: 4385; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4386; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off 4387; GFX940-NEXT: s_mov_b32 s0, 0x7060302 4388; GFX940-NEXT: s_mov_b32 s1, 0x5040100 4389; GFX940-NEXT: s_waitcnt vmcnt(0) 4390; GFX940-NEXT: v_perm_b32 v0, v2, v2, s0 4391; GFX940-NEXT: v_perm_b32 v1, v2, v2, s1 4392; GFX940-NEXT: s_setpc_b64 
s[30:31] 4393; 4394; GFX10-LABEL: shuffle_v4bf16_1100: 4395; GFX10: ; %bb.0: 4396; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4397; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off 4398; GFX10-NEXT: s_waitcnt vmcnt(0) 4399; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 4400; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 4401; GFX10-NEXT: s_setpc_b64 s[30:31] 4402; 4403; GFX11-LABEL: shuffle_v4bf16_1100: 4404; GFX11: ; %bb.0: 4405; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4406; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off 4407; GFX11-NEXT: s_waitcnt vmcnt(0) 4408; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 4409; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 4410; GFX11-NEXT: s_setpc_b64 s[30:31] 4411 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4412 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4413 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0> 4414 ret <4 x bfloat> %shuffle 4415} 4416 4417define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4418; GX900-LABEL: shuffle_v4bf16_6161: 4419; GX900: ; %bb.0: 4420; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4421; GX900-NEXT: global_load_dword v4, v[0:1], off 4422; GX900-NEXT: global_load_dword v5, v[2:3], off offset:4 4423; GX900-NEXT: s_mov_b32 s4, 0xffff 4424; GX900-NEXT: s_waitcnt vmcnt(0) 4425; GX900-NEXT: v_bfi_b32 v0, s4, v5, v4 4426; GX900-NEXT: v_mov_b32_e32 v1, v0 4427; GX900-NEXT: s_setpc_b64 s[30:31] 4428; 4429; GFX940-LABEL: shuffle_v4bf16_6161: 4430; GFX940: ; %bb.0: 4431; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4432; GFX940-NEXT: global_load_dword v4, v[0:1], off 4433; GFX940-NEXT: global_load_dword v5, v[2:3], off offset:4 4434; GFX940-NEXT: s_mov_b32 s0, 0xffff 4435; GFX940-NEXT: s_waitcnt vmcnt(0) 4436; GFX940-NEXT: v_bfi_b32 v0, s0, v5, v4 4437; GFX940-NEXT: v_mov_b32_e32 v1, v0 4438; GFX940-NEXT: s_setpc_b64 s[30:31] 4439; 4440; GFX10-LABEL: 
shuffle_v4bf16_6161: 4441; GFX10: ; %bb.0: 4442; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4443; GFX10-NEXT: global_load_dword v4, v[0:1], off 4444; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 4445; GFX10-NEXT: s_waitcnt vmcnt(0) 4446; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v5, v4 4447; GFX10-NEXT: v_mov_b32_e32 v1, v0 4448; GFX10-NEXT: s_setpc_b64 s[30:31] 4449; 4450; GFX11-LABEL: shuffle_v4bf16_6161: 4451; GFX11: ; %bb.0: 4452; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4453; GFX11-NEXT: global_load_b32 v0, v[0:1], off 4454; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 4455; GFX11-NEXT: s_waitcnt vmcnt(0) 4456; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 4457; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4458; GFX11-NEXT: v_mov_b32_e32 v1, v0 4459; GFX11-NEXT: s_setpc_b64 s[30:31] 4460 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4461 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4462 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1> 4463 ret <4 x bfloat> %shuffle 4464} 4465 4466define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4467; GX900-LABEL: shuffle_v4bf16_2333: 4468; GX900: ; %bb.0: 4469; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4470; GX900-NEXT: global_load_dword v0, v[0:1], off offset:4 4471; GX900-NEXT: s_mov_b32 s4, 0x7060302 4472; GX900-NEXT: s_waitcnt vmcnt(0) 4473; GX900-NEXT: v_perm_b32 v1, v0, v0, s4 4474; GX900-NEXT: s_setpc_b64 s[30:31] 4475; 4476; GFX940-LABEL: shuffle_v4bf16_2333: 4477; GFX940: ; %bb.0: 4478; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4479; GFX940-NEXT: global_load_dword v0, v[0:1], off offset:4 4480; GFX940-NEXT: s_mov_b32 s0, 0x7060302 4481; GFX940-NEXT: s_waitcnt vmcnt(0) 4482; GFX940-NEXT: v_perm_b32 v1, v0, v0, s0 4483; GFX940-NEXT: s_setpc_b64 s[30:31] 4484; 4485; GFX10-LABEL: shuffle_v4bf16_2333: 4486; GFX10: ; %bb.0: 4487; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 4488; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 4489; GFX10-NEXT: s_waitcnt vmcnt(0) 4490; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 4491; GFX10-NEXT: s_setpc_b64 s[30:31] 4492; 4493; GFX11-LABEL: shuffle_v4bf16_2333: 4494; GFX11: ; %bb.0: 4495; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4496; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 4497; GFX11-NEXT: s_waitcnt vmcnt(0) 4498; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 4499; GFX11-NEXT: s_setpc_b64 s[30:31] 4500 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4501 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4502 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> 4503 ret <4 x bfloat> %shuffle 4504} 4505; shuffle_v4bf16_6667 builds <val1[2], val1[2], val1[2], val1[3]> (mask indices 6, 6, 6, 7), so only %arg1 needs to be loaded. The mask previously repeated the 2, 3, 3, 3 pattern of shuffle_v4bf16_2333 and never touched %val1. NOTE: the assertions below were adjusted by hand for the corrected mask; re-run utils/update_llc_test_checks.py to confirm them. 4506define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4507; GX900-LABEL: shuffle_v4bf16_6667: 4508; GX900: ; %bb.0: 4509; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4510; GX900-NEXT: global_load_dword v1, v[2:3], off offset:4 4511; GX900-NEXT: s_mov_b32 s4, 0x5040100 4512; GX900-NEXT: s_waitcnt vmcnt(0) 4513; GX900-NEXT: v_perm_b32 v0, v1, v1, s4 4514; GX900-NEXT: s_setpc_b64 s[30:31] 4515; 4516; GFX940-LABEL: shuffle_v4bf16_6667: 4517; GFX940: ; %bb.0: 4518; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4519; GFX940-NEXT: global_load_dword v1, v[2:3], off offset:4 4520; GFX940-NEXT: s_mov_b32 s0, 0x5040100 4521; GFX940-NEXT: s_waitcnt vmcnt(0) 4522; GFX940-NEXT: v_perm_b32 v0, v1, v1, s0 4523; GFX940-NEXT: s_setpc_b64 s[30:31] 4524; 4525; GFX10-LABEL: shuffle_v4bf16_6667: 4526; GFX10: ; %bb.0: 4527; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4528; GFX10-NEXT: global_load_dword v1, v[2:3], off offset:4 4529; GFX10-NEXT: s_waitcnt vmcnt(0) 4530; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x5040100 4531; GFX10-NEXT: s_setpc_b64 s[30:31] 4532; 4533; GFX11-LABEL: shuffle_v4bf16_6667: 4534; GFX11: ; %bb.0: 4535; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 4536; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 4537; GFX11-NEXT: s_waitcnt vmcnt(0) 4538; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x5040100 4539; GFX11-NEXT: s_setpc_b64 s[30:31] 4540 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 4541 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 4542 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 6, i32 6, i32 7> 4543 ret <4 x bfloat> %shuffle 4544} 4545 4546define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4547; GFX9-LABEL: shuffle_v8bf16_0101: 4548; GFX9: ; %bb.0: 4549; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4550; GFX9-NEXT: global_load_dword v0, v[0:1], off 4551; GFX9-NEXT: s_waitcnt vmcnt(0) 4552; GFX9-NEXT: v_mov_b32_e32 v1, v0 4553; GFX9-NEXT: s_setpc_b64 s[30:31] 4554; 4555; GFX10-LABEL: shuffle_v8bf16_0101: 4556; GFX10: ; %bb.0: 4557; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4558; GFX10-NEXT: global_load_dword v0, v[0:1], off 4559; GFX10-NEXT: s_waitcnt vmcnt(0) 4560; GFX10-NEXT: v_mov_b32_e32 v1, v0 4561; GFX10-NEXT: s_setpc_b64 s[30:31] 4562; 4563; GFX11-LABEL: shuffle_v8bf16_0101: 4564; GFX11: ; %bb.0: 4565; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4566; GFX11-NEXT: global_load_b32 v0, v[0:1], off 4567; GFX11-NEXT: s_waitcnt vmcnt(0) 4568; GFX11-NEXT: v_mov_b32_e32 v1, v0 4569; GFX11-NEXT: s_setpc_b64 s[30:31] 4570 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 4571 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 4572 %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 4573 ret <4 x bfloat> %shuffle 4574} 4575 4576define <4 x bfloat> @shuffle_v8bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4577; GFX9-LABEL: shuffle_v8bf16_0123: 4578; GFX9: ; %bb.0: 4579; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4580; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4581; GFX9-NEXT: s_waitcnt vmcnt(0) 4582; GFX9-NEXT: 
s_setpc_b64 s[30:31] 4583; 4584; GFX10-LABEL: shuffle_v8bf16_0123: 4585; GFX10: ; %bb.0: 4586; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4587; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4588; GFX10-NEXT: s_waitcnt vmcnt(0) 4589; GFX10-NEXT: s_setpc_b64 s[30:31] 4590; 4591; GFX11-LABEL: shuffle_v8bf16_0123: 4592; GFX11: ; %bb.0: 4593; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4594; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 4595; GFX11-NEXT: s_waitcnt vmcnt(0) 4596; GFX11-NEXT: s_setpc_b64 s[30:31] 4597 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 4598 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 4599 %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4600 ret <4 x bfloat> %shuffle 4601} 4602 4603define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4604; GFX9-LABEL: shuffle_v8bf16_4589: 4605; GFX9: ; %bb.0: 4606; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4607; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 4608; GFX9-NEXT: global_load_dword v5, v[2:3], off 4609; GFX9-NEXT: s_waitcnt vmcnt(1) 4610; GFX9-NEXT: v_mov_b32_e32 v0, v4 4611; GFX9-NEXT: s_waitcnt vmcnt(0) 4612; GFX9-NEXT: v_mov_b32_e32 v1, v5 4613; GFX9-NEXT: s_setpc_b64 s[30:31] 4614; 4615; GFX10-LABEL: shuffle_v8bf16_4589: 4616; GFX10: ; %bb.0: 4617; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4618; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 4619; GFX10-NEXT: global_load_dword v5, v[2:3], off 4620; GFX10-NEXT: s_waitcnt vmcnt(1) 4621; GFX10-NEXT: v_mov_b32_e32 v0, v4 4622; GFX10-NEXT: s_waitcnt vmcnt(0) 4623; GFX10-NEXT: v_mov_b32_e32 v1, v5 4624; GFX10-NEXT: s_setpc_b64 s[30:31] 4625; 4626; GFX11-LABEL: shuffle_v8bf16_4589: 4627; GFX11: ; %bb.0: 4628; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4629; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 4630; GFX11-NEXT: global_load_b32 v1, v[2:3], off 4631; GFX11-NEXT: s_waitcnt 
vmcnt(0) 4632; GFX11-NEXT: s_setpc_b64 s[30:31] 4633 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 4634 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 4635 %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9> 4636 ret <4 x bfloat> %shuffle 4637} 4638 4639define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4640; GFX9-LABEL: shuffle_v8bf16_10_11_2_3: 4641; GFX9: ; %bb.0: 4642; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4643; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 4644; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 4645; GFX9-NEXT: s_waitcnt vmcnt(1) 4646; GFX9-NEXT: v_mov_b32_e32 v0, v4 4647; GFX9-NEXT: s_waitcnt vmcnt(0) 4648; GFX9-NEXT: v_mov_b32_e32 v1, v5 4649; GFX9-NEXT: s_setpc_b64 s[30:31] 4650; 4651; GFX10-LABEL: shuffle_v8bf16_10_11_2_3: 4652; GFX10: ; %bb.0: 4653; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4654; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 4655; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 4656; GFX10-NEXT: s_waitcnt vmcnt(1) 4657; GFX10-NEXT: v_mov_b32_e32 v0, v4 4658; GFX10-NEXT: s_waitcnt vmcnt(0) 4659; GFX10-NEXT: v_mov_b32_e32 v1, v5 4660; GFX10-NEXT: s_setpc_b64 s[30:31] 4661; 4662; GFX11-LABEL: shuffle_v8bf16_10_11_2_3: 4663; GFX11: ; %bb.0: 4664; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4665; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 4666; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 4667; GFX11-NEXT: s_waitcnt vmcnt(1) 4668; GFX11-NEXT: v_mov_b32_e32 v0, v2 4669; GFX11-NEXT: s_waitcnt vmcnt(0) 4670; GFX11-NEXT: s_setpc_b64 s[30:31] 4671 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 4672 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 4673 %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3> 4674 ret <4 x bfloat> %shuffle 4675} 4676 4677define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr 
addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4678; GX900-LABEL: shuffle_v8bf16_13_14_2_3: 4679; GX900: ; %bb.0: 4680; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4681; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 4682; GX900-NEXT: global_load_dword v4, v[0:1], off offset:4 4683; GX900-NEXT: s_waitcnt vmcnt(1) 4684; GX900-NEXT: v_alignbit_b32 v0, v6, v5, 16 4685; GX900-NEXT: s_waitcnt vmcnt(0) 4686; GX900-NEXT: v_mov_b32_e32 v1, v4 4687; GX900-NEXT: s_setpc_b64 s[30:31] 4688; 4689; GFX940-LABEL: shuffle_v8bf16_13_14_2_3: 4690; GFX940: ; %bb.0: 4691; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4692; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:8 4693; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4 4694; GFX940-NEXT: s_waitcnt vmcnt(1) 4695; GFX940-NEXT: v_alignbit_b32 v0, v7, v6, 16 4696; GFX940-NEXT: s_waitcnt vmcnt(0) 4697; GFX940-NEXT: v_mov_b32_e32 v1, v4 4698; GFX940-NEXT: s_setpc_b64 s[30:31] 4699; 4700; GFX10-LABEL: shuffle_v8bf16_13_14_2_3: 4701; GFX10: ; %bb.0: 4702; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4703; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 4704; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 4705; GFX10-NEXT: s_waitcnt vmcnt(1) 4706; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 4707; GFX10-NEXT: s_waitcnt vmcnt(0) 4708; GFX10-NEXT: v_mov_b32_e32 v1, v4 4709; GFX10-NEXT: s_setpc_b64 s[30:31] 4710; 4711; GFX11-LABEL: shuffle_v8bf16_13_14_2_3: 4712; GFX11: ; %bb.0: 4713; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4714; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 4715; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 4716; GFX11-NEXT: s_waitcnt vmcnt(1) 4717; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 4718; GFX11-NEXT: s_waitcnt vmcnt(0) 4719; GFX11-NEXT: s_setpc_b64 s[30:31] 4720 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 4721 %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 4722 %shuffle = shufflevector <8 x bfloat> 
%val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3> 4723 ret <4 x bfloat> %shuffle 4724} 4725 4726define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4727; GX900-LABEL: shuffle_v3bf16_0122: 4728; GX900: ; %bb.0: 4729; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4730; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4731; GX900-NEXT: s_mov_b32 s4, 0x5040100 4732; GX900-NEXT: s_waitcnt vmcnt(0) 4733; GX900-NEXT: v_perm_b32 v1, v1, v1, s4 4734; GX900-NEXT: s_setpc_b64 s[30:31] 4735; 4736; GFX940-LABEL: shuffle_v3bf16_0122: 4737; GFX940: ; %bb.0: 4738; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4739; GFX940-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4740; GFX940-NEXT: s_mov_b32 s0, 0x5040100 4741; GFX940-NEXT: s_waitcnt vmcnt(0) 4742; GFX940-NEXT: v_perm_b32 v1, v1, v1, s0 4743; GFX940-NEXT: s_setpc_b64 s[30:31] 4744; 4745; GFX10-LABEL: shuffle_v3bf16_0122: 4746; GFX10: ; %bb.0: 4747; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4748; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 4749; GFX10-NEXT: s_waitcnt vmcnt(0) 4750; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 4751; GFX10-NEXT: s_setpc_b64 s[30:31] 4752; 4753; GFX11-LABEL: shuffle_v3bf16_0122: 4754; GFX11: ; %bb.0: 4755; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4756; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 4757; GFX11-NEXT: s_waitcnt vmcnt(0) 4758; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 4759; GFX11-NEXT: s_setpc_b64 s[30:31] 4760 %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0 4761 %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1 4762 %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 4763 ret <4 x bfloat> %shuffle 4764} 4765 4766define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4767; GFX9-LABEL: shuffle_v2bf16_0122: 4768; GFX9: ; %bb.0: 4769; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4770; 
GFX9-NEXT: global_load_dword v0, v[0:1], off 4771; GFX9-NEXT: s_waitcnt vmcnt(0) 4772; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16 4773; GFX9-NEXT: s_setpc_b64 s[30:31] 4774; 4775; GFX10-LABEL: shuffle_v2bf16_0122: 4776; GFX10: ; %bb.0: 4777; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4778; GFX10-NEXT: global_load_dword v0, v[0:1], off 4779; GFX10-NEXT: s_waitcnt vmcnt(0) 4780; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16 4781; GFX10-NEXT: s_setpc_b64 s[30:31] 4782; 4783; GFX11-LABEL: shuffle_v2bf16_0122: 4784; GFX11: ; %bb.0: 4785; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4786; GFX11-NEXT: global_load_b32 v0, v[0:1], off 4787; GFX11-NEXT: s_waitcnt vmcnt(0) 4788; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16 4789; GFX11-NEXT: s_setpc_b64 s[30:31] 4790 %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0 4791 %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1 4792 %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0> 4793 ret <4 x bfloat> %shuffle 4794} 4795 4796define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 4797; GX900-LABEL: shuffle_v6bf16_452367: 4798; GX900: ; %bb.0: 4799; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4800; GX900-NEXT: v_mov_b32_e32 v6, v1 4801; GX900-NEXT: v_mov_b32_e32 v5, v0 4802; GX900-NEXT: v_mov_b32_e32 v4, v3 4803; GX900-NEXT: v_mov_b32_e32 v3, v2 4804; GX900-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 4805; GX900-NEXT: global_load_dword v7, v[3:4], off 4806; GX900-NEXT: s_waitcnt vmcnt(1) 4807; GX900-NEXT: v_mov_b32_e32 v0, v2 4808; GX900-NEXT: s_waitcnt vmcnt(0) 4809; GX900-NEXT: v_mov_b32_e32 v2, v7 4810; GX900-NEXT: s_setpc_b64 s[30:31] 4811; 4812; GFX940-LABEL: shuffle_v6bf16_452367: 4813; GFX940: ; %bb.0: 4814; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4815; GFX940-NEXT: v_mov_b32_e32 v7, v1 4816; GFX940-NEXT: v_mov_b32_e32 v6, v0 4817; GFX940-NEXT: v_mov_b32_e32 v5, v3 4818; GFX940-NEXT: v_mov_b32_e32 v4, v2 
4819; GFX940-NEXT: global_load_dwordx3 v[0:2], v[6:7], off 4820; GFX940-NEXT: global_load_dword v3, v[4:5], off 4821; GFX940-NEXT: s_waitcnt vmcnt(1) 4822; GFX940-NEXT: v_mov_b32_e32 v0, v2 4823; GFX940-NEXT: s_waitcnt vmcnt(0) 4824; GFX940-NEXT: v_mov_b32_e32 v2, v3 4825; GFX940-NEXT: s_setpc_b64 s[30:31] 4826; 4827; GFX10-LABEL: shuffle_v6bf16_452367: 4828; GFX10: ; %bb.0: 4829; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4830; GFX10-NEXT: v_mov_b32_e32 v6, v1 4831; GFX10-NEXT: v_mov_b32_e32 v5, v0 4832; GFX10-NEXT: v_mov_b32_e32 v4, v3 4833; GFX10-NEXT: v_mov_b32_e32 v3, v2 4834; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off 4835; GFX10-NEXT: global_load_dword v7, v[3:4], off 4836; GFX10-NEXT: s_waitcnt vmcnt(1) 4837; GFX10-NEXT: v_mov_b32_e32 v0, v2 4838; GFX10-NEXT: s_waitcnt vmcnt(0) 4839; GFX10-NEXT: v_mov_b32_e32 v2, v7 4840; GFX10-NEXT: s_setpc_b64 s[30:31] 4841; 4842; GFX11-LABEL: shuffle_v6bf16_452367: 4843; GFX11: ; %bb.0: 4844; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4845; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 4846; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off 4847; GFX11-NEXT: global_load_b32 v3, v[3:4], off 4848; GFX11-NEXT: s_waitcnt vmcnt(1) 4849; GFX11-NEXT: v_mov_b32_e32 v0, v2 4850; GFX11-NEXT: s_waitcnt vmcnt(0) 4851; GFX11-NEXT: v_mov_b32_e32 v2, v3 4852; GFX11-NEXT: s_setpc_b64 s[30:31] 4853 %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0 4854 %val1 = load <6 x bfloat>, ptr addrspace(1) %arg1 4855 %shuffle = shufflevector <6 x bfloat> %val0, <6 x bfloat> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7> 4856 ret <6 x bfloat> %shuffle 4857} 4858 4859define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { 4860; GX900-LABEL: fma_shuffle_v2bf16: 4861; GX900: ; %bb.0: ; %entry 4862; GX900-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 4863; GX900-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 
4864; GX900-NEXT: v_lshlrev_b32_e32 v0, 3, v0 4865; GX900-NEXT: s_movk_i32 s2, 0x7fff 4866; GX900-NEXT: s_mov_b32 s3, 0x7060302 4867; GX900-NEXT: s_waitcnt lgkmcnt(0) 4868; GX900-NEXT: global_load_dwordx2 v[1:2], v0, s[0:1] 4869; GX900-NEXT: global_load_dwordx2 v[3:4], v0, s[4:5] 4870; GX900-NEXT: global_load_dwordx2 v[5:6], v0, s[6:7] 4871; GX900-NEXT: s_waitcnt vmcnt(2) 4872; GX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 4873; GX900-NEXT: s_waitcnt vmcnt(1) 4874; GX900-NEXT: v_lshlrev_b32_e32 v8, 16, v3 4875; GX900-NEXT: s_waitcnt vmcnt(0) 4876; GX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 4877; GX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4878; GX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5 4879; GX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 4880; GX900-NEXT: v_lshlrev_b32_e32 v12, 16, v4 4881; GX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 4882; GX900-NEXT: v_fma_f32 v7, v8, v9, v7 4883; GX900-NEXT: v_fma_f32 v1, v8, v5, v1 4884; GX900-NEXT: v_fma_f32 v2, v12, v5, v2 4885; GX900-NEXT: v_bfe_u32 v5, v7, 16, 1 4886; GX900-NEXT: v_fma_f32 v8, v12, v9, v11 4887; GX900-NEXT: v_or_b32_e32 v9, 0x400000, v7 4888; GX900-NEXT: v_bfe_u32 v11, v1, 16, 1 4889; GX900-NEXT: v_add3_u32 v5, v5, v7, s2 4890; GX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 4891; GX900-NEXT: v_or_b32_e32 v12, 0x400000, v1 4892; GX900-NEXT: v_bfe_u32 v13, v8, 16, 1 4893; GX900-NEXT: v_add3_u32 v11, v11, v1, s2 4894; GX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc 4895; GX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 4896; GX900-NEXT: v_or_b32_e32 v14, 0x400000, v8 4897; GX900-NEXT: v_bfe_u32 v15, v2, 16, 1 4898; GX900-NEXT: v_add3_u32 v13, v13, v8, s2 4899; GX900-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc 4900; GX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 4901; GX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 4902; GX900-NEXT: v_lshlrev_b32_e32 v10, 16, v6 4903; GX900-NEXT: v_or_b32_e32 v16, 0x400000, v2 4904; GX900-NEXT: v_add3_u32 v15, v15, v2, s2 4905; GX900-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc 4906; GX900-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 4907; GX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 4908; GX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 4909; GX900-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc 4910; GX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 4911; GX900-NEXT: v_fma_f32 v1, v3, v10, v1 4912; GX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 4913; GX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 4914; GX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 4915; GX900-NEXT: v_fma_f32 v3, v3, v6, v5 4916; GX900-NEXT: v_bfe_u32 v5, v1, 16, 1 4917; GX900-NEXT: v_fma_f32 v2, v4, v10, v2 4918; GX900-NEXT: v_fma_f32 v4, v4, v6, v7 4919; GX900-NEXT: v_or_b32_e32 v6, 0x400000, v1 4920; GX900-NEXT: v_bfe_u32 v7, v3, 16, 1 4921; GX900-NEXT: v_add3_u32 v5, v5, v1, s2 4922; GX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 4923; GX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 4924; GX900-NEXT: v_bfe_u32 v9, v2, 16, 1 4925; GX900-NEXT: v_add3_u32 v7, v7, v3, s2 4926; GX900-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc 4927; GX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 4928; GX900-NEXT: v_or_b32_e32 v10, 0x400000, v2 4929; GX900-NEXT: v_bfe_u32 v11, v4, 16, 1 4930; GX900-NEXT: v_add3_u32 v9, v9, v2, s2 4931; GX900-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 4932; GX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 4933; GX900-NEXT: v_or_b32_e32 v12, 0x400000, v4 4934; GX900-NEXT: v_add3_u32 v11, v11, v4, s2 4935; GX900-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc 4936; GX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 4937; GX900-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc 4938; GX900-NEXT: v_perm_b32 v2, v4, v2, s3 4939; GX900-NEXT: v_perm_b32 v1, v3, v1, s3 4940; GX900-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] 4941; GX900-NEXT: s_endpgm 4942; 4943; GFX940-LABEL: fma_shuffle_v2bf16: 4944; GFX940: ; %bb.0: ; %entry 4945; GFX940-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 4946; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 4947; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 4948; GFX940-NEXT: v_lshlrev_b32_e32 v6, 3, v0 4949; GFX940-NEXT: s_movk_i32 s2, 0x7fff 4950; GFX940-NEXT: 
s_mov_b32 s3, 0x7060302 4951; GFX940-NEXT: s_waitcnt lgkmcnt(0) 4952; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] 4953; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] 4954; GFX940-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11] 4955; GFX940-NEXT: s_waitcnt vmcnt(2) 4956; GFX940-NEXT: v_lshlrev_b32_e32 v7, 16, v0 4957; GFX940-NEXT: s_waitcnt vmcnt(1) 4958; GFX940-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 4959; GFX940-NEXT: s_waitcnt vmcnt(0) 4960; GFX940-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 4961; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 4962; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4 4963; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v1 4964; GFX940-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 4965; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v3 4966; GFX940-NEXT: v_fmac_f32_e32 v8, v7, v9 4967; GFX940-NEXT: v_fmac_f32_e32 v2, v7, v4 4968; GFX940-NEXT: v_fmac_f32_e32 v3, v11, v4 4969; GFX940-NEXT: v_bfe_u32 v4, v8, 16, 1 4970; GFX940-NEXT: v_fmac_f32_e32 v12, v11, v9 4971; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v8 4972; GFX940-NEXT: v_bfe_u32 v9, v2, 16, 1 4973; GFX940-NEXT: v_add3_u32 v4, v4, v8, s2 4974; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 4975; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v2 4976; GFX940-NEXT: v_bfe_u32 v13, v12, 16, 1 4977; GFX940-NEXT: v_add3_u32 v9, v9, v2, s2 4978; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 4979; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 4980; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v12 4981; GFX940-NEXT: v_bfe_u32 v15, v3, 16, 1 4982; GFX940-NEXT: v_add3_u32 v13, v13, v12, s2 4983; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc 4984; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 4985; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 4986; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5 4987; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v3 4988; GFX940-NEXT: v_add3_u32 v15, v15, v3, s2 4989; GFX940-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc 4990; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 4991; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 
4992; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 4993; GFX940-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc 4994; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 4995; GFX940-NEXT: v_fmac_f32_e32 v2, v0, v10 4996; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 4997; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 4998; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 4999; GFX940-NEXT: v_fmac_f32_e32 v4, v0, v5 5000; GFX940-NEXT: v_bfe_u32 v0, v2, 16, 1 5001; GFX940-NEXT: v_fmac_f32_e32 v3, v1, v10 5002; GFX940-NEXT: v_fmac_f32_e32 v7, v1, v5 5003; GFX940-NEXT: v_or_b32_e32 v1, 0x400000, v2 5004; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 5005; GFX940-NEXT: v_add3_u32 v0, v0, v2, s2 5006; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 5007; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 5008; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 5009; GFX940-NEXT: v_add3_u32 v5, v5, v4, s2 5010; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 5011; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 5012; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 5013; GFX940-NEXT: v_bfe_u32 v11, v7, 16, 1 5014; GFX940-NEXT: v_add3_u32 v9, v9, v3, s2 5015; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc 5016; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 5017; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v7 5018; GFX940-NEXT: v_add3_u32 v11, v11, v7, s2 5019; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc 5020; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 5021; GFX940-NEXT: v_perm_b32 v0, v2, v0, s3 5022; GFX940-NEXT: s_nop 0 5023; GFX940-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc 5024; GFX940-NEXT: v_perm_b32 v1, v3, v1, s3 5025; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 5026; GFX940-NEXT: s_endpgm 5027; 5028; GFX10-LABEL: fma_shuffle_v2bf16: 5029; GFX10: ; %bb.0: ; %entry 5030; GFX10-NEXT: s_clause 0x1 5031; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 5032; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 5033; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 5034; GFX10-NEXT: s_waitcnt lgkmcnt(0) 5035; GFX10-NEXT: s_clause 0x2 
5036; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] 5037; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[4:5] 5038; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] 5039; GFX10-NEXT: s_waitcnt vmcnt(2) 5040; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 5041; GFX10-NEXT: s_waitcnt vmcnt(1) 5042; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2 5043; GFX10-NEXT: s_waitcnt vmcnt(0) 5044; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 5045; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5046; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5047; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 5048; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3 5049; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5050; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9 5051; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4 5052; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 5053; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9 5054; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4 5055; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1 5056; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7 5057; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1 5058; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 5059; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0 5060; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff 5061; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1 5062; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff 5063; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1 5064; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1 5065; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo 5066; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5067; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff 5068; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5069; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11 5070; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff 5071; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo 5072; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 5073; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 5074; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5075; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 5076; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 5077; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v15, v16, vcc_lo 5078; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 5079; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5 5080; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10 5081; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5082; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo 5083; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 5084; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 5085; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10 5086; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 5087; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5088; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 5089; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1 5090; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5 5091; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 5092; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 5093; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 5094; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff 5095; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1 5096; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 5097; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 5098; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7 5099; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 5100; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff 5101; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo 5102; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 5103; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo 5104; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 5105; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 5106; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo 5107; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 5108; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] 5109; GFX10-NEXT: s_endpgm 5110; 5111; GFX11-LABEL: fma_shuffle_v2bf16: 5112; GFX11: ; %bb.0: ; %entry 5113; GFX11-NEXT: s_clause 0x1 5114; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 5115; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x0 5116; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 5117; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5118; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 5119; GFX11-NEXT: s_waitcnt lgkmcnt(0) 5120; GFX11-NEXT: 
s_clause 0x2 5121; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1] 5122; GFX11-NEXT: global_load_b64 v[2:3], v6, s[4:5] 5123; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] 5124; GFX11-NEXT: s_waitcnt vmcnt(0) 5125; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5126; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 5127; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 5128; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 5129; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 5130; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5131; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 5132; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5133; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5134; GFX11-NEXT: v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2 5135; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 5136; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1 5137; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) 5138; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff 5139; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 5140; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 5141; GFX11-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0 5142; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 5143; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4 5144; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1 5145; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7 5146; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 5147; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 5148; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff 5149; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo 5150; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5151; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 5152; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 5153; GFX11-NEXT: v_fmac_f32_e32 v4, v2, v5 5154; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v9 5155; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 5156; GFX11-NEXT: 
v_or_b32_e32 v12, 0x400000, v0 5157; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 5158; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 5159; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1 5160; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 5161; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff 5162; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 5163; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff 5164; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5165; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo 5166; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 5167; GFX11-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo 5168; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 5169; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 5170; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo 5171; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 5172; GFX11-NEXT: v_fmac_f32_e32 v1, v3, v10 5173; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 5174; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 5175; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 5176; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v5 5177; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 5178; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 5179; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff 5180; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 5181; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 5182; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 5183; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 5184; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) 5185; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10 5186; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 5187; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff 5188; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 5189; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 5190; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 5191; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 5192; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 5193; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 5194; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 5195; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo 5196; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 5197; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo 5198; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 5199; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 5200; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 5201; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo 5202; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 5203; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] 5204; GFX11-NEXT: s_endpgm 5205entry: 5206 %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() 5207 %tmp12 = zext i32 %tmp1 to i64 5208 %arrayidx = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %A, i64 %tmp12 5209 %tmp14 = load <4 x bfloat>, ptr addrspace(1) %arrayidx, align 8 5210 %arrayidx1 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %B, i64 %tmp12 5211 %tmp15 = load <4 x bfloat>, ptr addrspace(1) %arrayidx1, align 8 5212 %arrayidx2 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %C, i64 %tmp12 5213 %tmp16 = load <4 x bfloat>, ptr addrspace(1) %arrayidx2, align 8 5214 %tmp17 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> zeroinitializer 5215 %tmp18 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1> 5216 %tmp19 = shufflevector <4 x bfloat> %tmp16, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1> 5217 %tmp20 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp17, <2 x bfloat> %tmp18, <2 x bfloat> %tmp19) 5218 %tmp21 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 1, i32 1> 5219 %tmp22 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3> 5220 %tmp23 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp21, <2 x bfloat> 
%tmp22, <2 x bfloat> %tmp20) 5221 %tmp24 = shufflevector <2 x bfloat> %tmp23, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 5222 %tmp25 = shufflevector <4 x bfloat> %tmp24, <4 x bfloat> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 5223 %tmp26 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 2, i32 2> 5224 %tmp27 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3> 5225 %tmp28 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp26, <2 x bfloat> %tmp18, <2 x bfloat> %tmp27) 5226 %tmp29 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 3, i32 3> 5227 %tmp30 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp29, <2 x bfloat> %tmp22, <2 x bfloat> %tmp28) 5228 %tmp31 = shufflevector <2 x bfloat> %tmp30, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 5229 %tmp32 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 5230 store <4 x bfloat> %tmp32, ptr addrspace(1) %arrayidx2, align 8 5231 ret void 5232} 5233 5234define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { 5235; GX900-LABEL: shuffle_v4bf16_0456: 5236; GX900: ; %bb.0: 5237; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5238; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 5239; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 5240; GX900-NEXT: s_mov_b32 s4, 0x5040100 5241; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1 5242; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3 5243; GX900-NEXT: s_waitcnt vmcnt(0) 5244; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 5245; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16 5246; GX900-NEXT: s_setpc_b64 s[30:31] 5247; 5248; GFX940-LABEL: shuffle_v4bf16_0456: 5249; GFX940: ; %bb.0: 5250; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5251; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 5252; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off 5253; 
GFX940-NEXT: s_mov_b32 s0, 0x5040100 5254; GFX940-NEXT: s_waitcnt vmcnt(0) 5255; GFX940-NEXT: v_perm_b32 v0, v6, v4, s0 5256; GFX940-NEXT: v_alignbit_b32 v1, v7, v6, 16 5257; GFX940-NEXT: s_setpc_b64 s[30:31] 5258; 5259; GFX10-LABEL: shuffle_v4bf16_0456: 5260; GFX10: ; %bb.0: 5261; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5262; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off 5263; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off 5264; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 5265; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 5266; GFX10-NEXT: s_waitcnt vmcnt(0) 5267; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 5268; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 5269; GFX10-NEXT: s_setpc_b64 s[30:31] 5270; 5271; GFX11-LABEL: shuffle_v4bf16_0456: 5272; GFX11: ; %bb.0: 5273; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5274; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 5275; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off 5276; GFX11-NEXT: s_waitcnt vmcnt(0) 5277; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 5278; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16 5279; GFX11-NEXT: s_setpc_b64 s[30:31] 5280 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 5281 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 5282 %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 5283 ret <4 x bfloat> %shuffle 5284} 5285 5286define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 5287; GX900-LABEL: low16bits: 5288; GX900: ; %bb.0: ; %entry 5289; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5290; GX900-NEXT: global_load_dword v4, v[0:1], off 5291; GX900-NEXT: global_load_dword v5, v[2:3], off 5292; GX900-NEXT: s_mov_b32 s4, 0x5040100 5293; GX900-NEXT: s_waitcnt vmcnt(0) 5294; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 5295; GX900-NEXT: s_setpc_b64 s[30:31] 5296; 5297; GFX940-LABEL: low16bits: 5298; GFX940: ; %bb.0: ; %entry 5299; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
5300; GFX940-NEXT: global_load_dword v4, v[0:1], off 5301; GFX940-NEXT: global_load_dword v5, v[2:3], off 5302; GFX940-NEXT: s_mov_b32 s0, 0x5040100 5303; GFX940-NEXT: s_waitcnt vmcnt(0) 5304; GFX940-NEXT: v_perm_b32 v0, v5, v4, s0 5305; GFX940-NEXT: s_setpc_b64 s[30:31] 5306; 5307; GFX10-LABEL: low16bits: 5308; GFX10: ; %bb.0: ; %entry 5309; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5310; GFX10-NEXT: global_load_dword v4, v[0:1], off 5311; GFX10-NEXT: global_load_dword v5, v[2:3], off 5312; GFX10-NEXT: s_waitcnt vmcnt(0) 5313; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 5314; GFX10-NEXT: s_setpc_b64 s[30:31] 5315; 5316; GFX11-LABEL: low16bits: 5317; GFX11: ; %bb.0: ; %entry 5318; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5319; GFX11-NEXT: global_load_b32 v0, v[0:1], off 5320; GFX11-NEXT: global_load_b32 v1, v[2:3], off 5321; GFX11-NEXT: s_waitcnt vmcnt(0) 5322; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 5323; GFX11-NEXT: s_setpc_b64 s[30:31] 5324entry: 5325 %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 5326 %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4 5327 %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef> 5328 %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2> 5329 ret <2 x bfloat> %vy1.2.vec.insert 5330} 5331 5332define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { 5333; GX900-LABEL: hi16bits_v2bf16: 5334; GX900: ; %bb.0: ; %entry 5335; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5336; GX900-NEXT: global_load_dword v4, v[0:1], off 5337; GX900-NEXT: global_load_dword v5, v[2:3], off 5338; GX900-NEXT: s_mov_b32 s4, 0x7060302 5339; GX900-NEXT: s_waitcnt vmcnt(0) 5340; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 5341; GX900-NEXT: s_setpc_b64 s[30:31] 5342; 5343; GFX940-LABEL: hi16bits_v2bf16: 5344; GFX940: ; %bb.0: ; %entry 5345; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5346; 
; GFX940-NEXT:    global_load_dword v4, v[0:1], off
; GFX940-NEXT:    global_load_dword v5, v[2:3], off
; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: hi16bits_v2bf16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: hi16bits_v2bf16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
  ret <2 x bfloat> %vy1.2.vec.insert
}

; Builds a <2 x bfloat> whose lane 0 is element 0 of the vector loaded from
; %x0 and whose lane 1 is element 1 of the vector loaded from %x1 (mask index 3
; selects element 1 of the second shuffle operand). The expected code merges
; the two loaded dwords with a single v_bfi_b32 using a 0xffff mask.
define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GX900-LABEL: low16hi16bits_v2bf16:
; GX900:       ; %bb.0: ; %entry
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dword v4, v[2:3], off
; GX900-NEXT:    global_load_dword v5, v[0:1], off
; GX900-NEXT:    s_mov_b32 s4, 0xffff
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    v_bfi_b32 v0, s4, v5, v4
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: low16hi16bits_v2bf16:
; GFX940:       ; %bb.0: ; %entry
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dword v4, v[2:3], off
; GFX940-NEXT:    global_load_dword v5, v[0:1], off
; GFX940-NEXT:    s_mov_b32 s0, 0xffff
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_bfi_b32 v0, s0, v5, v4
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: low16hi16bits_v2bf16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[2:3], off
; GFX10-NEXT:    global_load_dword v5, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: low16hi16bits_v2bf16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
  ret <2 x bfloat> %vy1.2.vec.insert
}

; Builds a <2 x bfloat> whose lane 0 is element 1 of the vector loaded from
; %x0 and whose lane 1 is element 0 of the vector loaded from %x1 (mask index 2
; selects element 0 of the second shuffle operand). The expected code combines
; the two loaded dwords with a single v_alignbit_b32 by 16 bits.
define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-LABEL: hi16low16bits:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v4, v[0:1], off
; GFX9-NEXT:    global_load_dword v5, v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: hi16low16bits:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v[0:1], off
; GFX10-NEXT:    global_load_dword v5, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: hi16low16bits:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
  ret <2 x bfloat> %vy1.2.vec.insert
}

; Shuffles an all-zero <2 x bfloat> (built with two insertelement
; instructions) with the vector loaded from %x0, keeping zero in lane 0 and
; element 1 of the loaded vector in lane 1. The expected code is one load
; followed by a v_and_b32 with 0xffff0000.
define <2 x bfloat> @v2bfloat_hi16bits(ptr addrspace(1) %x0) {
; GFX9-LABEL: v2bfloat_hi16bits:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v2bfloat_hi16bits:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v2bfloat_hi16bits:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %load0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
  %insert1 = insertelement <2 x bfloat> undef, bfloat 0.0, i32 0
  %insert2 = insertelement <2 x bfloat> %insert1, bfloat 0.0, i32 1
  %vec.ret = shufflevector <2 x bfloat> %insert2, <2 x bfloat> %load0, <2 x i32> <i32 0, i32 3>
  ret <2 x bfloat> %vec.ret
}

; Concatenates two <4 x bfloat> loads into an <8 x bfloat> using the identity
; mask 0..7 and stores the result to %out. The expected code is two 64-bit
; loads feeding one 128-bit store, with no extra register shuffling.
define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
; GX900-LABEL: shuffle_v8bf16_concat:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v8bf16_concat:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX940-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8bf16_concat:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8bf16_concat:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x bfloat> %shuffle, ptr addrspace(1) %out
  ret void
}

; Concatenates two <8 x bfloat> loads into a <16 x bfloat> using the identity
; mask 0..15 and stores the result to %out. The expected code is two 128-bit
; loads and two 128-bit stores (the %arg1 half lands at offset 16).
define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
; GX900-LABEL: shuffle_v16bf16_concat:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GX900-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
; GX900-NEXT:    s_waitcnt vmcnt(1)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v16bf16_concat:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(1)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v16bf16_concat:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v16bf16_concat:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x bfloat> %shuffle, ptr addrspace(1) %out
  ret void
}

; Concatenates two <16 x bfloat> loads into a <32 x bfloat> using the identity
; mask 0..31 and stores the result to %out. The expected code is four 128-bit
; loads and four 128-bit stores (the %arg1 half lands at offsets 32 and 48).
define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
; GX900-LABEL: shuffle_v32bf16_concat:
; GX900:       ; %bb.0:
; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GX900-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GX900-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
; GX900-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GX900-NEXT:    s_waitcnt vmcnt(3)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
; GX900-NEXT:    s_waitcnt vmcnt(3)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
; GX900-NEXT:    s_waitcnt vmcnt(3)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
; GX900-NEXT:    s_waitcnt vmcnt(3)
; GX900-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
; GX900-NEXT:    s_waitcnt vmcnt(0)
; GX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: shuffle_v32bf16_concat:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GFX940-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
; GFX940-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX940-NEXT:    s_waitcnt vmcnt(3)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(3)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(3)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(3)
; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v32bf16_concat:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(3)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v32bf16_concat:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:32
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:48
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
  %val1 = load <16 x bfloat>, ptr addrspace(1) %arg1
  %shuffle = shufflevector <16 x bfloat> %val0, <16 x bfloat> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  store <32 x bfloat> %shuffle, ptr addrspace(1) %out
  ret void
}

declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }