1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s 4 5define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) { 6; GFX9-LABEL: shuffle_v2i8_rebroadcast: 7; GFX9: ; %bb.0: ; %entry 8; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9; GFX9-NEXT: global_load_ushort v0, v[0:1], off 10; GFX9-NEXT: s_waitcnt vmcnt(0) 11; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 12; GFX9-NEXT: v_mov_b32_e32 v1, v0 13; GFX9-NEXT: s_setpc_b64 s[30:31] 14; 15; GFX10-LABEL: shuffle_v2i8_rebroadcast: 16; GFX10: ; %bb.0: ; %entry 17; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18; GFX10-NEXT: global_load_ushort v0, v[0:1], off 19; GFX10-NEXT: s_waitcnt vmcnt(0) 20; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 21; GFX10-NEXT: v_mov_b32_e32 v1, v0 22; GFX10-NEXT: s_setpc_b64 s[30:31] 23; 24; GFX11-LABEL: shuffle_v2i8_rebroadcast: 25; GFX11: ; %bb.0: ; %entry 26; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27; GFX11-NEXT: global_load_u16 v0, v[0:1], off 28; GFX11-NEXT: s_waitcnt vmcnt(0) 29; GFX11-NEXT: v_lshrrev_b16 v0, 8, v0 30; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 31; GFX11-NEXT: v_mov_b32_e32 v1, v0 32; GFX11-NEXT: s_setpc_b64 s[30:31] 33entry: 34 %val0 = load <2 x i8>, ptr addrspace(1) %arg0 35 %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1> 36 ret <2 x i8> %val1 37} 38 39define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) { 40; GFX9-LABEL: shuffle_v4i8_rebroadcast: 41; GFX9: ; %bb.0: ; %entry 42; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 43; GFX9-NEXT: global_load_dword v0, v[0:1], off 44; GFX9-NEXT: s_waitcnt vmcnt(0) 45; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 46; GFX9-NEXT: v_mov_b32_e32 v1, v0 47; GFX9-NEXT: v_mov_b32_e32 v2, v0 48; GFX9-NEXT: v_mov_b32_e32 v3, v0 49; GFX9-NEXT: 
s_setpc_b64 s[30:31] 50; 51; GFX10-LABEL: shuffle_v4i8_rebroadcast: 52; GFX10: ; %bb.0: ; %entry 53; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 54; GFX10-NEXT: global_load_dword v0, v[0:1], off 55; GFX10-NEXT: s_waitcnt vmcnt(0) 56; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 57; GFX10-NEXT: v_mov_b32_e32 v1, v0 58; GFX10-NEXT: v_mov_b32_e32 v2, v0 59; GFX10-NEXT: v_mov_b32_e32 v3, v0 60; GFX10-NEXT: s_setpc_b64 s[30:31] 61; 62; GFX11-LABEL: shuffle_v4i8_rebroadcast: 63; GFX11: ; %bb.0: ; %entry 64; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 65; GFX11-NEXT: global_load_b32 v0, v[0:1], off 66; GFX11-NEXT: s_waitcnt vmcnt(0) 67; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 68; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 69; GFX11-NEXT: v_mov_b32_e32 v1, v0 70; GFX11-NEXT: v_mov_b32_e32 v2, v0 71; GFX11-NEXT: v_mov_b32_e32 v3, v0 72; GFX11-NEXT: s_setpc_b64 s[30:31] 73entry: 74 %val0 = load <4 x i8>, ptr addrspace(1) %arg0 75 %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 76 ret <4 x i8> %val1 77} 78 79define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) { 80; GFX9-LABEL: shuffle_v8i8_rebroadcast: 81; GFX9: ; %bb.0: ; %entry 82; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 83; GFX9-NEXT: global_load_dword v0, v[0:1], off 84; GFX9-NEXT: s_waitcnt vmcnt(0) 85; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 86; GFX9-NEXT: v_mov_b32_e32 v1, v0 87; GFX9-NEXT: v_mov_b32_e32 v2, v0 88; GFX9-NEXT: v_mov_b32_e32 v3, v0 89; GFX9-NEXT: v_mov_b32_e32 v4, v0 90; GFX9-NEXT: v_mov_b32_e32 v5, v0 91; GFX9-NEXT: v_mov_b32_e32 v6, v0 92; GFX9-NEXT: v_mov_b32_e32 v7, v0 93; GFX9-NEXT: s_setpc_b64 s[30:31] 94; 95; GFX10-LABEL: shuffle_v8i8_rebroadcast: 96; GFX10: ; %bb.0: ; %entry 97; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 98; GFX10-NEXT: global_load_dword v0, v[0:1], off 99; GFX10-NEXT: s_waitcnt vmcnt(0) 100; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 101; GFX10-NEXT: v_mov_b32_e32 v1, v0 102; GFX10-NEXT: 
v_mov_b32_e32 v2, v0 103; GFX10-NEXT: v_mov_b32_e32 v3, v0 104; GFX10-NEXT: v_mov_b32_e32 v4, v0 105; GFX10-NEXT: v_mov_b32_e32 v5, v0 106; GFX10-NEXT: v_mov_b32_e32 v6, v0 107; GFX10-NEXT: v_mov_b32_e32 v7, v0 108; GFX10-NEXT: s_setpc_b64 s[30:31] 109; 110; GFX11-LABEL: shuffle_v8i8_rebroadcast: 111; GFX11: ; %bb.0: ; %entry 112; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 113; GFX11-NEXT: global_load_b32 v0, v[0:1], off 114; GFX11-NEXT: s_waitcnt vmcnt(0) 115; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 116; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 117; GFX11-NEXT: v_mov_b32_e32 v1, v0 118; GFX11-NEXT: v_mov_b32_e32 v2, v0 119; GFX11-NEXT: v_mov_b32_e32 v3, v0 120; GFX11-NEXT: v_mov_b32_e32 v4, v0 121; GFX11-NEXT: v_mov_b32_e32 v5, v0 122; GFX11-NEXT: v_mov_b32_e32 v6, v0 123; GFX11-NEXT: v_mov_b32_e32 v7, v0 124; GFX11-NEXT: s_setpc_b64 s[30:31] 125entry: 126 %val0 = load <8 x i8>, ptr addrspace(1) %arg0 127 %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 128 ret <8 x i8> %val1 129} 130 131define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) { 132; GFX9-LABEL: shuffle_v16i8_rebroadcast: 133; GFX9: ; %bb.0: ; %entry 134; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 135; GFX9-NEXT: global_load_dword v0, v[0:1], off 136; GFX9-NEXT: s_waitcnt vmcnt(0) 137; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 138; GFX9-NEXT: v_mov_b32_e32 v1, v0 139; GFX9-NEXT: v_mov_b32_e32 v2, v0 140; GFX9-NEXT: v_mov_b32_e32 v3, v0 141; GFX9-NEXT: v_mov_b32_e32 v4, v0 142; GFX9-NEXT: v_mov_b32_e32 v5, v0 143; GFX9-NEXT: v_mov_b32_e32 v6, v0 144; GFX9-NEXT: v_mov_b32_e32 v7, v0 145; GFX9-NEXT: v_mov_b32_e32 v8, v0 146; GFX9-NEXT: v_mov_b32_e32 v9, v0 147; GFX9-NEXT: v_mov_b32_e32 v10, v0 148; GFX9-NEXT: v_mov_b32_e32 v11, v0 149; GFX9-NEXT: v_mov_b32_e32 v12, v0 150; GFX9-NEXT: v_mov_b32_e32 v13, v0 151; GFX9-NEXT: v_mov_b32_e32 v14, v0 152; GFX9-NEXT: v_mov_b32_e32 v15, v0 153; GFX9-NEXT: 
s_setpc_b64 s[30:31] 154; 155; GFX10-LABEL: shuffle_v16i8_rebroadcast: 156; GFX10: ; %bb.0: ; %entry 157; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 158; GFX10-NEXT: global_load_dword v0, v[0:1], off 159; GFX10-NEXT: s_waitcnt vmcnt(0) 160; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 161; GFX10-NEXT: v_mov_b32_e32 v1, v0 162; GFX10-NEXT: v_mov_b32_e32 v2, v0 163; GFX10-NEXT: v_mov_b32_e32 v3, v0 164; GFX10-NEXT: v_mov_b32_e32 v4, v0 165; GFX10-NEXT: v_mov_b32_e32 v5, v0 166; GFX10-NEXT: v_mov_b32_e32 v6, v0 167; GFX10-NEXT: v_mov_b32_e32 v7, v0 168; GFX10-NEXT: v_mov_b32_e32 v8, v0 169; GFX10-NEXT: v_mov_b32_e32 v9, v0 170; GFX10-NEXT: v_mov_b32_e32 v10, v0 171; GFX10-NEXT: v_mov_b32_e32 v11, v0 172; GFX10-NEXT: v_mov_b32_e32 v12, v0 173; GFX10-NEXT: v_mov_b32_e32 v13, v0 174; GFX10-NEXT: v_mov_b32_e32 v14, v0 175; GFX10-NEXT: v_mov_b32_e32 v15, v0 176; GFX10-NEXT: s_setpc_b64 s[30:31] 177; 178; GFX11-LABEL: shuffle_v16i8_rebroadcast: 179; GFX11: ; %bb.0: ; %entry 180; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 181; GFX11-NEXT: global_load_b32 v0, v[0:1], off 182; GFX11-NEXT: s_waitcnt vmcnt(0) 183; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 184; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 185; GFX11-NEXT: v_mov_b32_e32 v1, v0 186; GFX11-NEXT: v_mov_b32_e32 v2, v0 187; GFX11-NEXT: v_mov_b32_e32 v3, v0 188; GFX11-NEXT: v_mov_b32_e32 v4, v0 189; GFX11-NEXT: v_mov_b32_e32 v5, v0 190; GFX11-NEXT: v_mov_b32_e32 v6, v0 191; GFX11-NEXT: v_mov_b32_e32 v7, v0 192; GFX11-NEXT: v_mov_b32_e32 v8, v0 193; GFX11-NEXT: v_mov_b32_e32 v9, v0 194; GFX11-NEXT: v_mov_b32_e32 v10, v0 195; GFX11-NEXT: v_mov_b32_e32 v11, v0 196; GFX11-NEXT: v_mov_b32_e32 v12, v0 197; GFX11-NEXT: v_mov_b32_e32 v13, v0 198; GFX11-NEXT: v_mov_b32_e32 v14, v0 199; GFX11-NEXT: v_mov_b32_e32 v15, v0 200; GFX11-NEXT: s_setpc_b64 s[30:31] 201entry: 202 %val0 = load <16 x i8>, ptr addrspace(1) %arg0 203 %val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 204 ret <16 x i8> %val1 205} 206 207define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) { 208; GFX9-LABEL: shuffle_v32i8_rebroadcast: 209; GFX9: ; %bb.0: ; %entry 210; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 211; GFX9-NEXT: global_load_dword v0, v[0:1], off 212; GFX9-NEXT: s_waitcnt vmcnt(0) 213; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 214; GFX9-NEXT: v_mov_b32_e32 v1, v0 215; GFX9-NEXT: v_mov_b32_e32 v2, v0 216; GFX9-NEXT: v_mov_b32_e32 v3, v0 217; GFX9-NEXT: v_mov_b32_e32 v4, v0 218; GFX9-NEXT: v_mov_b32_e32 v5, v0 219; GFX9-NEXT: v_mov_b32_e32 v6, v0 220; GFX9-NEXT: v_mov_b32_e32 v7, v0 221; GFX9-NEXT: v_mov_b32_e32 v8, v0 222; GFX9-NEXT: v_mov_b32_e32 v9, v0 223; GFX9-NEXT: v_mov_b32_e32 v10, v0 224; GFX9-NEXT: v_mov_b32_e32 v11, v0 225; GFX9-NEXT: v_mov_b32_e32 v12, v0 226; GFX9-NEXT: v_mov_b32_e32 v13, v0 227; GFX9-NEXT: v_mov_b32_e32 v14, v0 228; GFX9-NEXT: v_mov_b32_e32 v15, v0 229; GFX9-NEXT: v_mov_b32_e32 v16, v0 230; GFX9-NEXT: v_mov_b32_e32 v17, v0 231; GFX9-NEXT: v_mov_b32_e32 v18, v0 232; GFX9-NEXT: v_mov_b32_e32 v19, v0 233; GFX9-NEXT: v_mov_b32_e32 v20, v0 234; GFX9-NEXT: v_mov_b32_e32 v21, v0 235; GFX9-NEXT: v_mov_b32_e32 v22, v0 236; GFX9-NEXT: v_mov_b32_e32 v23, v0 237; GFX9-NEXT: v_mov_b32_e32 v24, v0 238; GFX9-NEXT: v_mov_b32_e32 v25, v0 239; GFX9-NEXT: v_mov_b32_e32 v26, v0 240; GFX9-NEXT: v_mov_b32_e32 v27, v0 241; GFX9-NEXT: v_mov_b32_e32 v28, v0 242; GFX9-NEXT: v_mov_b32_e32 v29, v0 243; GFX9-NEXT: v_mov_b32_e32 v30, v0 244; GFX9-NEXT: v_mov_b32_e32 v31, v0 245; GFX9-NEXT: s_setpc_b64 s[30:31] 246; 247; GFX10-LABEL: shuffle_v32i8_rebroadcast: 248; GFX10: ; %bb.0: ; %entry 249; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 250; GFX10-NEXT: global_load_dword v0, v[0:1], off 251; GFX10-NEXT: s_waitcnt vmcnt(0) 252; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 253; GFX10-NEXT: v_mov_b32_e32 v1, v0 254; GFX10-NEXT: v_mov_b32_e32 v2, v0 255; 
GFX10-NEXT: v_mov_b32_e32 v3, v0 256; GFX10-NEXT: v_mov_b32_e32 v4, v0 257; GFX10-NEXT: v_mov_b32_e32 v5, v0 258; GFX10-NEXT: v_mov_b32_e32 v6, v0 259; GFX10-NEXT: v_mov_b32_e32 v7, v0 260; GFX10-NEXT: v_mov_b32_e32 v8, v0 261; GFX10-NEXT: v_mov_b32_e32 v9, v0 262; GFX10-NEXT: v_mov_b32_e32 v10, v0 263; GFX10-NEXT: v_mov_b32_e32 v11, v0 264; GFX10-NEXT: v_mov_b32_e32 v12, v0 265; GFX10-NEXT: v_mov_b32_e32 v13, v0 266; GFX10-NEXT: v_mov_b32_e32 v14, v0 267; GFX10-NEXT: v_mov_b32_e32 v15, v0 268; GFX10-NEXT: v_mov_b32_e32 v16, v0 269; GFX10-NEXT: v_mov_b32_e32 v17, v0 270; GFX10-NEXT: v_mov_b32_e32 v18, v0 271; GFX10-NEXT: v_mov_b32_e32 v19, v0 272; GFX10-NEXT: v_mov_b32_e32 v20, v0 273; GFX10-NEXT: v_mov_b32_e32 v21, v0 274; GFX10-NEXT: v_mov_b32_e32 v22, v0 275; GFX10-NEXT: v_mov_b32_e32 v23, v0 276; GFX10-NEXT: v_mov_b32_e32 v24, v0 277; GFX10-NEXT: v_mov_b32_e32 v25, v0 278; GFX10-NEXT: v_mov_b32_e32 v26, v0 279; GFX10-NEXT: v_mov_b32_e32 v27, v0 280; GFX10-NEXT: v_mov_b32_e32 v28, v0 281; GFX10-NEXT: v_mov_b32_e32 v29, v0 282; GFX10-NEXT: v_mov_b32_e32 v30, v0 283; GFX10-NEXT: v_mov_b32_e32 v31, v0 284; GFX10-NEXT: s_setpc_b64 s[30:31] 285; 286; GFX11-LABEL: shuffle_v32i8_rebroadcast: 287; GFX11: ; %bb.0: ; %entry 288; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 289; GFX11-NEXT: global_load_b32 v0, v[0:1], off 290; GFX11-NEXT: s_waitcnt vmcnt(0) 291; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 292; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 293; GFX11-NEXT: v_mov_b32_e32 v1, v0 294; GFX11-NEXT: v_mov_b32_e32 v2, v0 295; GFX11-NEXT: v_mov_b32_e32 v3, v0 296; GFX11-NEXT: v_mov_b32_e32 v4, v0 297; GFX11-NEXT: v_mov_b32_e32 v5, v0 298; GFX11-NEXT: v_mov_b32_e32 v6, v0 299; GFX11-NEXT: v_mov_b32_e32 v7, v0 300; GFX11-NEXT: v_mov_b32_e32 v8, v0 301; GFX11-NEXT: v_mov_b32_e32 v9, v0 302; GFX11-NEXT: v_mov_b32_e32 v10, v0 303; GFX11-NEXT: v_mov_b32_e32 v11, v0 304; GFX11-NEXT: v_mov_b32_e32 v12, v0 305; GFX11-NEXT: v_mov_b32_e32 v13, v0 306; GFX11-NEXT: 
v_mov_b32_e32 v14, v0 307; GFX11-NEXT: v_mov_b32_e32 v15, v0 308; GFX11-NEXT: v_mov_b32_e32 v16, v0 309; GFX11-NEXT: v_mov_b32_e32 v17, v0 310; GFX11-NEXT: v_mov_b32_e32 v18, v0 311; GFX11-NEXT: v_mov_b32_e32 v19, v0 312; GFX11-NEXT: v_mov_b32_e32 v20, v0 313; GFX11-NEXT: v_mov_b32_e32 v21, v0 314; GFX11-NEXT: v_mov_b32_e32 v22, v0 315; GFX11-NEXT: v_mov_b32_e32 v23, v0 316; GFX11-NEXT: v_mov_b32_e32 v24, v0 317; GFX11-NEXT: v_mov_b32_e32 v25, v0 318; GFX11-NEXT: v_mov_b32_e32 v26, v0 319; GFX11-NEXT: v_mov_b32_e32 v27, v0 320; GFX11-NEXT: v_mov_b32_e32 v28, v0 321; GFX11-NEXT: v_mov_b32_e32 v29, v0 322; GFX11-NEXT: v_mov_b32_e32 v30, v0 323; GFX11-NEXT: v_mov_b32_e32 v31, v0 324; GFX11-NEXT: s_setpc_b64 s[30:31] 325entry: 326 %val0 = load <32 x i8>, ptr addrspace(1) %arg0 327 %val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 328 ret <32 x i8> %val1 329} 330 331define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) { 332; GFX9-LABEL: shuffle_v2i16_rebroadcast: 333; GFX9: ; %bb.0: ; %entry 334; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 335; GFX9-NEXT: global_load_dword v0, v[0:1], off 336; GFX9-NEXT: s_mov_b32 s4, 0x7060302 337; GFX9-NEXT: s_waitcnt vmcnt(0) 338; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 339; GFX9-NEXT: s_setpc_b64 s[30:31] 340; 341; GFX10-LABEL: shuffle_v2i16_rebroadcast: 342; GFX10: ; %bb.0: ; %entry 343; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 344; GFX10-NEXT: global_load_dword v0, v[0:1], off 345; GFX10-NEXT: s_waitcnt vmcnt(0) 346; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 347; GFX10-NEXT: s_setpc_b64 s[30:31] 348; 349; GFX11-LABEL: shuffle_v2i16_rebroadcast: 350; GFX11: ; %bb.0: ; %entry 351; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 352; GFX11-NEXT: 
global_load_b32 v0, v[0:1], off 353; GFX11-NEXT: s_waitcnt vmcnt(0) 354; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 355; GFX11-NEXT: s_setpc_b64 s[30:31] 356entry: 357 %val0 = load <2 x i16>, ptr addrspace(1) %arg0 358 %val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1> 359 ret <2 x i16> %val1 360} 361 362define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) { 363; GFX9-LABEL: shuffle_v4i16_rebroadcast: 364; GFX9: ; %bb.0: ; %entry 365; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 366; GFX9-NEXT: global_load_dword v0, v[0:1], off 367; GFX9-NEXT: s_mov_b32 s4, 0x7060302 368; GFX9-NEXT: s_waitcnt vmcnt(0) 369; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 370; GFX9-NEXT: v_mov_b32_e32 v1, v0 371; GFX9-NEXT: s_setpc_b64 s[30:31] 372; 373; GFX10-LABEL: shuffle_v4i16_rebroadcast: 374; GFX10: ; %bb.0: ; %entry 375; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 376; GFX10-NEXT: global_load_dword v0, v[0:1], off 377; GFX10-NEXT: s_waitcnt vmcnt(0) 378; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 379; GFX10-NEXT: v_mov_b32_e32 v1, v0 380; GFX10-NEXT: s_setpc_b64 s[30:31] 381; 382; GFX11-LABEL: shuffle_v4i16_rebroadcast: 383; GFX11: ; %bb.0: ; %entry 384; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 385; GFX11-NEXT: global_load_b32 v0, v[0:1], off 386; GFX11-NEXT: s_waitcnt vmcnt(0) 387; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 388; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 389; GFX11-NEXT: v_mov_b32_e32 v1, v0 390; GFX11-NEXT: s_setpc_b64 s[30:31] 391entry: 392 %val0 = load <4 x i16>, ptr addrspace(1) %arg0 393 %val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 394 ret <4 x i16> %val1 395} 396 397define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) { 398; GFX9-LABEL: shuffle_v8i16_rebroadcast: 399; GFX9: ; %bb.0: ; %entry 400; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 401; GFX9-NEXT: global_load_dword v0, v[0:1], off 402; GFX9-NEXT: s_mov_b32 
s4, 0x7060302 403; GFX9-NEXT: s_waitcnt vmcnt(0) 404; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 405; GFX9-NEXT: v_mov_b32_e32 v1, v0 406; GFX9-NEXT: v_mov_b32_e32 v2, v0 407; GFX9-NEXT: v_mov_b32_e32 v3, v0 408; GFX9-NEXT: s_setpc_b64 s[30:31] 409; 410; GFX10-LABEL: shuffle_v8i16_rebroadcast: 411; GFX10: ; %bb.0: ; %entry 412; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 413; GFX10-NEXT: global_load_dword v0, v[0:1], off 414; GFX10-NEXT: s_waitcnt vmcnt(0) 415; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 416; GFX10-NEXT: v_mov_b32_e32 v1, v0 417; GFX10-NEXT: v_mov_b32_e32 v2, v0 418; GFX10-NEXT: v_mov_b32_e32 v3, v0 419; GFX10-NEXT: s_setpc_b64 s[30:31] 420; 421; GFX11-LABEL: shuffle_v8i16_rebroadcast: 422; GFX11: ; %bb.0: ; %entry 423; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 424; GFX11-NEXT: global_load_b32 v0, v[0:1], off 425; GFX11-NEXT: s_waitcnt vmcnt(0) 426; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 427; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 428; GFX11-NEXT: v_mov_b32_e32 v1, v0 429; GFX11-NEXT: v_mov_b32_e32 v2, v0 430; GFX11-NEXT: v_mov_b32_e32 v3, v0 431; GFX11-NEXT: s_setpc_b64 s[30:31] 432entry: 433 %val0 = load <8 x i16>, ptr addrspace(1) %arg0 434 %val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 435 ret <8 x i16> %val1 436} 437 438define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) { 439; GFX9-LABEL: shuffle_v16i16_rebroadcast: 440; GFX9: ; %bb.0: ; %entry 441; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 442; GFX9-NEXT: global_load_dword v0, v[0:1], off 443; GFX9-NEXT: s_mov_b32 s4, 0x7060302 444; GFX9-NEXT: s_waitcnt vmcnt(0) 445; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 446; GFX9-NEXT: v_mov_b32_e32 v1, v0 447; GFX9-NEXT: v_mov_b32_e32 v2, v0 448; GFX9-NEXT: v_mov_b32_e32 v3, v0 449; GFX9-NEXT: v_mov_b32_e32 v4, v0 450; GFX9-NEXT: v_mov_b32_e32 v5, v0 451; GFX9-NEXT: v_mov_b32_e32 v6, v0 452; GFX9-NEXT: v_mov_b32_e32 v7, v0 453; 
GFX9-NEXT: s_setpc_b64 s[30:31] 454; 455; GFX10-LABEL: shuffle_v16i16_rebroadcast: 456; GFX10: ; %bb.0: ; %entry 457; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 458; GFX10-NEXT: global_load_dword v0, v[0:1], off 459; GFX10-NEXT: s_waitcnt vmcnt(0) 460; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 461; GFX10-NEXT: v_mov_b32_e32 v1, v0 462; GFX10-NEXT: v_mov_b32_e32 v2, v0 463; GFX10-NEXT: v_mov_b32_e32 v3, v0 464; GFX10-NEXT: v_mov_b32_e32 v4, v0 465; GFX10-NEXT: v_mov_b32_e32 v5, v0 466; GFX10-NEXT: v_mov_b32_e32 v6, v0 467; GFX10-NEXT: v_mov_b32_e32 v7, v0 468; GFX10-NEXT: s_setpc_b64 s[30:31] 469; 470; GFX11-LABEL: shuffle_v16i16_rebroadcast: 471; GFX11: ; %bb.0: ; %entry 472; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 473; GFX11-NEXT: global_load_b32 v0, v[0:1], off 474; GFX11-NEXT: s_waitcnt vmcnt(0) 475; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 476; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 477; GFX11-NEXT: v_mov_b32_e32 v1, v0 478; GFX11-NEXT: v_mov_b32_e32 v2, v0 479; GFX11-NEXT: v_mov_b32_e32 v3, v0 480; GFX11-NEXT: v_mov_b32_e32 v4, v0 481; GFX11-NEXT: v_mov_b32_e32 v5, v0 482; GFX11-NEXT: v_mov_b32_e32 v6, v0 483; GFX11-NEXT: v_mov_b32_e32 v7, v0 484; GFX11-NEXT: s_setpc_b64 s[30:31] 485entry: 486 %val0 = load <16 x i16>, ptr addrspace(1) %arg0 487 %val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 488 ret <16 x i16> %val1 489} 490 491define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) { 492; GFX9-LABEL: shuffle_v32i16_rebroadcast: 493; GFX9: ; %bb.0: ; %entry 494; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 495; GFX9-NEXT: global_load_dword v0, v[0:1], off 496; GFX9-NEXT: s_mov_b32 s4, 0x7060302 497; GFX9-NEXT: s_waitcnt vmcnt(0) 498; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 499; GFX9-NEXT: v_mov_b32_e32 v1, v0 500; GFX9-NEXT: v_mov_b32_e32 v2, v0 501; GFX9-NEXT: v_mov_b32_e32 v3, 
v0 502; GFX9-NEXT: v_mov_b32_e32 v4, v0 503; GFX9-NEXT: v_mov_b32_e32 v5, v0 504; GFX9-NEXT: v_mov_b32_e32 v6, v0 505; GFX9-NEXT: v_mov_b32_e32 v7, v0 506; GFX9-NEXT: v_mov_b32_e32 v8, v0 507; GFX9-NEXT: v_mov_b32_e32 v9, v0 508; GFX9-NEXT: v_mov_b32_e32 v10, v0 509; GFX9-NEXT: v_mov_b32_e32 v11, v0 510; GFX9-NEXT: v_mov_b32_e32 v12, v0 511; GFX9-NEXT: v_mov_b32_e32 v13, v0 512; GFX9-NEXT: v_mov_b32_e32 v14, v0 513; GFX9-NEXT: v_mov_b32_e32 v15, v0 514; GFX9-NEXT: s_setpc_b64 s[30:31] 515; 516; GFX10-LABEL: shuffle_v32i16_rebroadcast: 517; GFX10: ; %bb.0: ; %entry 518; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 519; GFX10-NEXT: global_load_dword v0, v[0:1], off 520; GFX10-NEXT: s_waitcnt vmcnt(0) 521; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 522; GFX10-NEXT: v_mov_b32_e32 v1, v0 523; GFX10-NEXT: v_mov_b32_e32 v2, v0 524; GFX10-NEXT: v_mov_b32_e32 v3, v0 525; GFX10-NEXT: v_mov_b32_e32 v4, v0 526; GFX10-NEXT: v_mov_b32_e32 v5, v0 527; GFX10-NEXT: v_mov_b32_e32 v6, v0 528; GFX10-NEXT: v_mov_b32_e32 v7, v0 529; GFX10-NEXT: v_mov_b32_e32 v8, v0 530; GFX10-NEXT: v_mov_b32_e32 v9, v0 531; GFX10-NEXT: v_mov_b32_e32 v10, v0 532; GFX10-NEXT: v_mov_b32_e32 v11, v0 533; GFX10-NEXT: v_mov_b32_e32 v12, v0 534; GFX10-NEXT: v_mov_b32_e32 v13, v0 535; GFX10-NEXT: v_mov_b32_e32 v14, v0 536; GFX10-NEXT: v_mov_b32_e32 v15, v0 537; GFX10-NEXT: s_setpc_b64 s[30:31] 538; 539; GFX11-LABEL: shuffle_v32i16_rebroadcast: 540; GFX11: ; %bb.0: ; %entry 541; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 542; GFX11-NEXT: global_load_b32 v0, v[0:1], off 543; GFX11-NEXT: s_waitcnt vmcnt(0) 544; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 545; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 546; GFX11-NEXT: v_mov_b32_e32 v1, v0 547; GFX11-NEXT: v_mov_b32_e32 v2, v0 548; GFX11-NEXT: v_mov_b32_e32 v3, v0 549; GFX11-NEXT: v_mov_b32_e32 v4, v0 550; GFX11-NEXT: v_mov_b32_e32 v5, v0 551; GFX11-NEXT: v_mov_b32_e32 v6, v0 552; GFX11-NEXT: v_mov_b32_e32 v7, v0 553; GFX11-NEXT: v_mov_b32_e32 
v8, v0 554; GFX11-NEXT: v_mov_b32_e32 v9, v0 555; GFX11-NEXT: v_mov_b32_e32 v10, v0 556; GFX11-NEXT: v_mov_b32_e32 v11, v0 557; GFX11-NEXT: v_mov_b32_e32 v12, v0 558; GFX11-NEXT: v_mov_b32_e32 v13, v0 559; GFX11-NEXT: v_mov_b32_e32 v14, v0 560; GFX11-NEXT: v_mov_b32_e32 v15, v0 561; GFX11-NEXT: s_setpc_b64 s[30:31] 562entry: 563 %val0 = load <32 x i16>, ptr addrspace(1) %arg0 564 %val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 565 ret <32 x i16> %val1 566} 567 568define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) { 569; GFX9-LABEL: shuffle_v2i32_rebroadcast: 570; GFX9: ; %bb.0: ; %entry 571; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 572; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 573; GFX9-NEXT: s_waitcnt vmcnt(0) 574; GFX9-NEXT: v_mov_b32_e32 v1, v0 575; GFX9-NEXT: s_setpc_b64 s[30:31] 576; 577; GFX10-LABEL: shuffle_v2i32_rebroadcast: 578; GFX10: ; %bb.0: ; %entry 579; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 580; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 581; GFX10-NEXT: s_waitcnt vmcnt(0) 582; GFX10-NEXT: v_mov_b32_e32 v1, v0 583; GFX10-NEXT: s_setpc_b64 s[30:31] 584; 585; GFX11-LABEL: shuffle_v2i32_rebroadcast: 586; GFX11: ; %bb.0: ; %entry 587; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 588; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 589; GFX11-NEXT: s_waitcnt vmcnt(0) 590; GFX11-NEXT: v_mov_b32_e32 v1, v0 591; GFX11-NEXT: s_setpc_b64 s[30:31] 592entry: 593 %val0 = load <2 x i32>, ptr addrspace(1) %arg0 594 %val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1> 595 ret <2 x i32> %val1 596} 597 598define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) { 599; GFX9-LABEL: shuffle_v4i32_rebroadcast: 600; 
GFX9: ; %bb.0: ; %entry 601; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 602; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 603; GFX9-NEXT: s_waitcnt vmcnt(0) 604; GFX9-NEXT: v_mov_b32_e32 v1, v0 605; GFX9-NEXT: v_mov_b32_e32 v2, v0 606; GFX9-NEXT: v_mov_b32_e32 v3, v0 607; GFX9-NEXT: s_setpc_b64 s[30:31] 608; 609; GFX10-LABEL: shuffle_v4i32_rebroadcast: 610; GFX10: ; %bb.0: ; %entry 611; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 612; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 613; GFX10-NEXT: s_waitcnt vmcnt(0) 614; GFX10-NEXT: v_mov_b32_e32 v1, v0 615; GFX10-NEXT: v_mov_b32_e32 v2, v0 616; GFX10-NEXT: v_mov_b32_e32 v3, v0 617; GFX10-NEXT: s_setpc_b64 s[30:31] 618; 619; GFX11-LABEL: shuffle_v4i32_rebroadcast: 620; GFX11: ; %bb.0: ; %entry 621; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 622; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 623; GFX11-NEXT: s_waitcnt vmcnt(0) 624; GFX11-NEXT: v_mov_b32_e32 v1, v0 625; GFX11-NEXT: v_mov_b32_e32 v2, v0 626; GFX11-NEXT: v_mov_b32_e32 v3, v0 627; GFX11-NEXT: s_setpc_b64 s[30:31] 628entry: 629 %val0 = load <4 x i32>, ptr addrspace(1) %arg0 630 %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 631 ret <4 x i32> %val1 632} 633 634define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) { 635; GFX9-LABEL: shuffle_v8i32_rebroadcast: 636; GFX9: ; %bb.0: ; %entry 637; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 638; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 639; GFX9-NEXT: s_waitcnt vmcnt(0) 640; GFX9-NEXT: v_mov_b32_e32 v1, v0 641; GFX9-NEXT: v_mov_b32_e32 v2, v0 642; GFX9-NEXT: v_mov_b32_e32 v3, v0 643; GFX9-NEXT: v_mov_b32_e32 v4, v0 644; GFX9-NEXT: v_mov_b32_e32 v5, v0 645; GFX9-NEXT: v_mov_b32_e32 v6, v0 646; GFX9-NEXT: v_mov_b32_e32 v7, v0 647; GFX9-NEXT: s_setpc_b64 s[30:31] 648; 649; GFX10-LABEL: shuffle_v8i32_rebroadcast: 650; GFX10: ; %bb.0: ; %entry 651; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 652; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 653; GFX10-NEXT: s_waitcnt vmcnt(0) 654; GFX10-NEXT: v_mov_b32_e32 v1, v0 655; GFX10-NEXT: v_mov_b32_e32 v2, v0 656; GFX10-NEXT: v_mov_b32_e32 v3, v0 657; GFX10-NEXT: v_mov_b32_e32 v4, v0 658; GFX10-NEXT: v_mov_b32_e32 v5, v0 659; GFX10-NEXT: v_mov_b32_e32 v6, v0 660; GFX10-NEXT: v_mov_b32_e32 v7, v0 661; GFX10-NEXT: s_setpc_b64 s[30:31] 662; 663; GFX11-LABEL: shuffle_v8i32_rebroadcast: 664; GFX11: ; %bb.0: ; %entry 665; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 666; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 667; GFX11-NEXT: s_waitcnt vmcnt(0) 668; GFX11-NEXT: v_mov_b32_e32 v1, v0 669; GFX11-NEXT: v_mov_b32_e32 v2, v0 670; GFX11-NEXT: v_mov_b32_e32 v3, v0 671; GFX11-NEXT: v_mov_b32_e32 v4, v0 672; GFX11-NEXT: v_mov_b32_e32 v5, v0 673; GFX11-NEXT: v_mov_b32_e32 v6, v0 674; GFX11-NEXT: v_mov_b32_e32 v7, v0 675; GFX11-NEXT: s_setpc_b64 s[30:31] 676entry: 677 %val0 = load <8 x i32>, ptr addrspace(1) %arg0 678 %val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 679 ret <8 x i32> %val1 680} 681 682define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) { 683; GFX9-LABEL: shuffle_v16i32_rebroadcast: 684; GFX9: ; %bb.0: ; %entry 685; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 686; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 687; GFX9-NEXT: s_waitcnt vmcnt(0) 688; GFX9-NEXT: v_mov_b32_e32 v1, v0 689; GFX9-NEXT: v_mov_b32_e32 v2, v0 690; GFX9-NEXT: v_mov_b32_e32 v3, v0 691; GFX9-NEXT: v_mov_b32_e32 v4, v0 692; GFX9-NEXT: v_mov_b32_e32 v5, v0 693; GFX9-NEXT: v_mov_b32_e32 v6, v0 694; GFX9-NEXT: v_mov_b32_e32 v7, v0 695; GFX9-NEXT: v_mov_b32_e32 v8, v0 696; GFX9-NEXT: v_mov_b32_e32 v9, v0 697; GFX9-NEXT: v_mov_b32_e32 v10, v0 698; GFX9-NEXT: v_mov_b32_e32 v11, v0 699; GFX9-NEXT: v_mov_b32_e32 v12, v0 700; GFX9-NEXT: v_mov_b32_e32 v13, v0 701; GFX9-NEXT: v_mov_b32_e32 v14, v0 
702; GFX9-NEXT: v_mov_b32_e32 v15, v0 703; GFX9-NEXT: s_setpc_b64 s[30:31] 704; 705; GFX10-LABEL: shuffle_v16i32_rebroadcast: 706; GFX10: ; %bb.0: ; %entry 707; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 708; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 709; GFX10-NEXT: s_waitcnt vmcnt(0) 710; GFX10-NEXT: v_mov_b32_e32 v1, v0 711; GFX10-NEXT: v_mov_b32_e32 v2, v0 712; GFX10-NEXT: v_mov_b32_e32 v3, v0 713; GFX10-NEXT: v_mov_b32_e32 v4, v0 714; GFX10-NEXT: v_mov_b32_e32 v5, v0 715; GFX10-NEXT: v_mov_b32_e32 v6, v0 716; GFX10-NEXT: v_mov_b32_e32 v7, v0 717; GFX10-NEXT: v_mov_b32_e32 v8, v0 718; GFX10-NEXT: v_mov_b32_e32 v9, v0 719; GFX10-NEXT: v_mov_b32_e32 v10, v0 720; GFX10-NEXT: v_mov_b32_e32 v11, v0 721; GFX10-NEXT: v_mov_b32_e32 v12, v0 722; GFX10-NEXT: v_mov_b32_e32 v13, v0 723; GFX10-NEXT: v_mov_b32_e32 v14, v0 724; GFX10-NEXT: v_mov_b32_e32 v15, v0 725; GFX10-NEXT: s_setpc_b64 s[30:31] 726; 727; GFX11-LABEL: shuffle_v16i32_rebroadcast: 728; GFX11: ; %bb.0: ; %entry 729; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 730; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 731; GFX11-NEXT: s_waitcnt vmcnt(0) 732; GFX11-NEXT: v_mov_b32_e32 v1, v0 733; GFX11-NEXT: v_mov_b32_e32 v2, v0 734; GFX11-NEXT: v_mov_b32_e32 v3, v0 735; GFX11-NEXT: v_mov_b32_e32 v4, v0 736; GFX11-NEXT: v_mov_b32_e32 v5, v0 737; GFX11-NEXT: v_mov_b32_e32 v6, v0 738; GFX11-NEXT: v_mov_b32_e32 v7, v0 739; GFX11-NEXT: v_mov_b32_e32 v8, v0 740; GFX11-NEXT: v_mov_b32_e32 v9, v0 741; GFX11-NEXT: v_mov_b32_e32 v10, v0 742; GFX11-NEXT: v_mov_b32_e32 v11, v0 743; GFX11-NEXT: v_mov_b32_e32 v12, v0 744; GFX11-NEXT: v_mov_b32_e32 v13, v0 745; GFX11-NEXT: v_mov_b32_e32 v14, v0 746; GFX11-NEXT: v_mov_b32_e32 v15, v0 747; GFX11-NEXT: s_setpc_b64 s[30:31] 748entry: 749 %val0 = load <16 x i32>, ptr addrspace(1) %arg0 750 %val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, 
i32 1, i32 1, i32 1> 751 ret <16 x i32> %val1 752} 753 754define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) { 755; GFX9-LABEL: shuffle_v32i32_rebroadcast: 756; GFX9: ; %bb.0: ; %entry 757; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 758; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 759; GFX9-NEXT: s_waitcnt vmcnt(0) 760; GFX9-NEXT: v_mov_b32_e32 v1, v0 761; GFX9-NEXT: v_mov_b32_e32 v2, v0 762; GFX9-NEXT: v_mov_b32_e32 v3, v0 763; GFX9-NEXT: v_mov_b32_e32 v4, v0 764; GFX9-NEXT: v_mov_b32_e32 v5, v0 765; GFX9-NEXT: v_mov_b32_e32 v6, v0 766; GFX9-NEXT: v_mov_b32_e32 v7, v0 767; GFX9-NEXT: v_mov_b32_e32 v8, v0 768; GFX9-NEXT: v_mov_b32_e32 v9, v0 769; GFX9-NEXT: v_mov_b32_e32 v10, v0 770; GFX9-NEXT: v_mov_b32_e32 v11, v0 771; GFX9-NEXT: v_mov_b32_e32 v12, v0 772; GFX9-NEXT: v_mov_b32_e32 v13, v0 773; GFX9-NEXT: v_mov_b32_e32 v14, v0 774; GFX9-NEXT: v_mov_b32_e32 v15, v0 775; GFX9-NEXT: v_mov_b32_e32 v16, v0 776; GFX9-NEXT: v_mov_b32_e32 v17, v0 777; GFX9-NEXT: v_mov_b32_e32 v18, v0 778; GFX9-NEXT: v_mov_b32_e32 v19, v0 779; GFX9-NEXT: v_mov_b32_e32 v20, v0 780; GFX9-NEXT: v_mov_b32_e32 v21, v0 781; GFX9-NEXT: v_mov_b32_e32 v22, v0 782; GFX9-NEXT: v_mov_b32_e32 v23, v0 783; GFX9-NEXT: v_mov_b32_e32 v24, v0 784; GFX9-NEXT: v_mov_b32_e32 v25, v0 785; GFX9-NEXT: v_mov_b32_e32 v26, v0 786; GFX9-NEXT: v_mov_b32_e32 v27, v0 787; GFX9-NEXT: v_mov_b32_e32 v28, v0 788; GFX9-NEXT: v_mov_b32_e32 v29, v0 789; GFX9-NEXT: v_mov_b32_e32 v30, v0 790; GFX9-NEXT: v_mov_b32_e32 v31, v0 791; GFX9-NEXT: s_setpc_b64 s[30:31] 792; 793; GFX10-LABEL: shuffle_v32i32_rebroadcast: 794; GFX10: ; %bb.0: ; %entry 795; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 796; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 797; GFX10-NEXT: s_waitcnt vmcnt(0) 798; GFX10-NEXT: v_mov_b32_e32 v1, v0 799; GFX10-NEXT: v_mov_b32_e32 v2, v0 800; GFX10-NEXT: v_mov_b32_e32 v3, v0 801; GFX10-NEXT: v_mov_b32_e32 v4, v0 802; GFX10-NEXT: v_mov_b32_e32 v5, v0 803; 
GFX10-NEXT: v_mov_b32_e32 v6, v0 804; GFX10-NEXT: v_mov_b32_e32 v7, v0 805; GFX10-NEXT: v_mov_b32_e32 v8, v0 806; GFX10-NEXT: v_mov_b32_e32 v9, v0 807; GFX10-NEXT: v_mov_b32_e32 v10, v0 808; GFX10-NEXT: v_mov_b32_e32 v11, v0 809; GFX10-NEXT: v_mov_b32_e32 v12, v0 810; GFX10-NEXT: v_mov_b32_e32 v13, v0 811; GFX10-NEXT: v_mov_b32_e32 v14, v0 812; GFX10-NEXT: v_mov_b32_e32 v15, v0 813; GFX10-NEXT: v_mov_b32_e32 v16, v0 814; GFX10-NEXT: v_mov_b32_e32 v17, v0 815; GFX10-NEXT: v_mov_b32_e32 v18, v0 816; GFX10-NEXT: v_mov_b32_e32 v19, v0 817; GFX10-NEXT: v_mov_b32_e32 v20, v0 818; GFX10-NEXT: v_mov_b32_e32 v21, v0 819; GFX10-NEXT: v_mov_b32_e32 v22, v0 820; GFX10-NEXT: v_mov_b32_e32 v23, v0 821; GFX10-NEXT: v_mov_b32_e32 v24, v0 822; GFX10-NEXT: v_mov_b32_e32 v25, v0 823; GFX10-NEXT: v_mov_b32_e32 v26, v0 824; GFX10-NEXT: v_mov_b32_e32 v27, v0 825; GFX10-NEXT: v_mov_b32_e32 v28, v0 826; GFX10-NEXT: v_mov_b32_e32 v29, v0 827; GFX10-NEXT: v_mov_b32_e32 v30, v0 828; GFX10-NEXT: v_mov_b32_e32 v31, v0 829; GFX10-NEXT: s_setpc_b64 s[30:31] 830; 831; GFX11-LABEL: shuffle_v32i32_rebroadcast: 832; GFX11: ; %bb.0: ; %entry 833; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 834; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 835; GFX11-NEXT: s_waitcnt vmcnt(0) 836; GFX11-NEXT: v_mov_b32_e32 v1, v0 837; GFX11-NEXT: v_mov_b32_e32 v2, v0 838; GFX11-NEXT: v_mov_b32_e32 v3, v0 839; GFX11-NEXT: v_mov_b32_e32 v4, v0 840; GFX11-NEXT: v_mov_b32_e32 v5, v0 841; GFX11-NEXT: v_mov_b32_e32 v6, v0 842; GFX11-NEXT: v_mov_b32_e32 v7, v0 843; GFX11-NEXT: v_mov_b32_e32 v8, v0 844; GFX11-NEXT: v_mov_b32_e32 v9, v0 845; GFX11-NEXT: v_mov_b32_e32 v10, v0 846; GFX11-NEXT: v_mov_b32_e32 v11, v0 847; GFX11-NEXT: v_mov_b32_e32 v12, v0 848; GFX11-NEXT: v_mov_b32_e32 v13, v0 849; GFX11-NEXT: v_mov_b32_e32 v14, v0 850; GFX11-NEXT: v_mov_b32_e32 v15, v0 851; GFX11-NEXT: v_mov_b32_e32 v16, v0 852; GFX11-NEXT: v_mov_b32_e32 v17, v0 853; GFX11-NEXT: v_mov_b32_e32 v18, v0 854; GFX11-NEXT: v_mov_b32_e32 
v19, v0 855; GFX11-NEXT: v_mov_b32_e32 v20, v0 856; GFX11-NEXT: v_mov_b32_e32 v21, v0 857; GFX11-NEXT: v_mov_b32_e32 v22, v0 858; GFX11-NEXT: v_mov_b32_e32 v23, v0 859; GFX11-NEXT: v_mov_b32_e32 v24, v0 860; GFX11-NEXT: v_mov_b32_e32 v25, v0 861; GFX11-NEXT: v_mov_b32_e32 v26, v0 862; GFX11-NEXT: v_mov_b32_e32 v27, v0 863; GFX11-NEXT: v_mov_b32_e32 v28, v0 864; GFX11-NEXT: v_mov_b32_e32 v29, v0 865; GFX11-NEXT: v_mov_b32_e32 v30, v0 866; GFX11-NEXT: v_mov_b32_e32 v31, v0 867; GFX11-NEXT: s_setpc_b64 s[30:31] 868entry: 869 %val0 = load <32 x i32>, ptr addrspace(1) %arg0 870 %val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 871 ret <32 x i32> %val1 872} 873 874define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) { 875; GFX9-LABEL: shuffle_v2bf16_rebroadcast: 876; GFX9: ; %bb.0: ; %entry 877; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 878; GFX9-NEXT: global_load_dword v0, v[0:1], off 879; GFX9-NEXT: s_mov_b32 s4, 0x7060302 880; GFX9-NEXT: s_waitcnt vmcnt(0) 881; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 882; GFX9-NEXT: s_setpc_b64 s[30:31] 883; 884; GFX10-LABEL: shuffle_v2bf16_rebroadcast: 885; GFX10: ; %bb.0: ; %entry 886; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 887; GFX10-NEXT: global_load_dword v0, v[0:1], off 888; GFX10-NEXT: s_waitcnt vmcnt(0) 889; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 890; GFX10-NEXT: s_setpc_b64 s[30:31] 891; 892; GFX11-LABEL: shuffle_v2bf16_rebroadcast: 893; GFX11: ; %bb.0: ; %entry 894; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 895; GFX11-NEXT: global_load_b32 v0, v[0:1], off 896; GFX11-NEXT: s_waitcnt vmcnt(0) 897; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 898; GFX11-NEXT: s_setpc_b64 s[30:31] 899entry: 900 %val0 = load <2 x bfloat>, ptr 
; shuffle_v3bf16_rebroadcast (defined below) splats element 1 of a loaded
; <3 x bfloat> through the shufflevector mask <1, 1, 1>. For every target the
; expected code loads one dword into v1, forms v0 with v_perm_b32 (constant
; 0x7060302) and forms v1 with v_alignbit_b32 using a shift of 16.
; NOTE(review): the SGPR operand of v_alignbit_b32 (s4 in the gfx900 and
; gfx1010 runs, s0 in the gfx1100 run) is never written by the gfx1010 and
; gfx1100 sequences, and in the gfx900 run it only holds the v_perm_b32
; constant. Presumably those bits end up in an unused part of v1 -- confirm
; before treating this as a codegen bug; the assertions are generated output.
addrspace(1) %arg0 901 %val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1> 902 ret <2 x bfloat> %val1 903} 904 905define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) { 906; GFX9-LABEL: shuffle_v3bf16_rebroadcast: 907; GFX9: ; %bb.0: ; %entry 908; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 909; GFX9-NEXT: global_load_dword v1, v[0:1], off 910; GFX9-NEXT: s_mov_b32 s4, 0x7060302 911; GFX9-NEXT: s_waitcnt vmcnt(0) 912; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 913; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 914; GFX9-NEXT: s_setpc_b64 s[30:31] 915; 916; GFX10-LABEL: shuffle_v3bf16_rebroadcast: 917; GFX10: ; %bb.0: ; %entry 918; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 919; GFX10-NEXT: global_load_dword v1, v[0:1], off 920; GFX10-NEXT: s_waitcnt vmcnt(0) 921; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 922; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 923; GFX10-NEXT: s_setpc_b64 s[30:31] 924; 925; GFX11-LABEL: shuffle_v3bf16_rebroadcast: 926; GFX11: ; %bb.0: ; %entry 927; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 928; GFX11-NEXT: global_load_b32 v1, v[0:1], off 929; GFX11-NEXT: s_waitcnt vmcnt(0) 930; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 931; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16 932; GFX11-NEXT: s_setpc_b64 s[30:31] 933entry: 934 %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0 935 %val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1> 936 ret <3 x bfloat> %val1 937} 938 939define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) { 940; GFX9-LABEL: shuffle_v4bf16_rebroadcast: 941; GFX9: ; %bb.0: ; %entry 942; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 943; GFX9-NEXT: global_load_dword v0, v[0:1], off 944; GFX9-NEXT: s_mov_b32 s4, 0x7060302 945; GFX9-NEXT: s_waitcnt vmcnt(0) 946; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 947; GFX9-NEXT: v_mov_b32_e32 v1, v0 948; GFX9-NEXT: s_setpc_b64 s[30:31] 949; 950; GFX10-LABEL:
shuffle_v4bf16_rebroadcast: 951; GFX10: ; %bb.0: ; %entry 952; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 953; GFX10-NEXT: global_load_dword v0, v[0:1], off 954; GFX10-NEXT: s_waitcnt vmcnt(0) 955; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 956; GFX10-NEXT: v_mov_b32_e32 v1, v0 957; GFX10-NEXT: s_setpc_b64 s[30:31] 958; 959; GFX11-LABEL: shuffle_v4bf16_rebroadcast: 960; GFX11: ; %bb.0: ; %entry 961; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 962; GFX11-NEXT: global_load_b32 v0, v[0:1], off 963; GFX11-NEXT: s_waitcnt vmcnt(0) 964; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 965; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 966; GFX11-NEXT: v_mov_b32_e32 v1, v0 967; GFX11-NEXT: s_setpc_b64 s[30:31] 968entry: 969 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 970 %val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 971 ret <4 x bfloat> %val1 972} 973 974define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) { 975; GFX9-LABEL: shuffle_v6bf16_rebroadcast: 976; GFX9: ; %bb.0: ; %entry 977; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 978; GFX9-NEXT: global_load_dword v0, v[0:1], off 979; GFX9-NEXT: s_mov_b32 s4, 0x7060302 980; GFX9-NEXT: s_waitcnt vmcnt(0) 981; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 982; GFX9-NEXT: v_mov_b32_e32 v1, v0 983; GFX9-NEXT: v_mov_b32_e32 v2, v0 984; GFX9-NEXT: s_setpc_b64 s[30:31] 985; 986; GFX10-LABEL: shuffle_v6bf16_rebroadcast: 987; GFX10: ; %bb.0: ; %entry 988; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 989; GFX10-NEXT: global_load_dword v0, v[0:1], off 990; GFX10-NEXT: s_waitcnt vmcnt(0) 991; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 992; GFX10-NEXT: v_mov_b32_e32 v1, v0 993; GFX10-NEXT: v_mov_b32_e32 v2, v0 994; GFX10-NEXT: s_setpc_b64 s[30:31] 995; 996; GFX11-LABEL: shuffle_v6bf16_rebroadcast: 997; GFX11: ; %bb.0: ; %entry 998; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 999; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1000; 
GFX11-NEXT: s_waitcnt vmcnt(0) 1001; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1002; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1003; GFX11-NEXT: v_mov_b32_e32 v1, v0 1004; GFX11-NEXT: v_mov_b32_e32 v2, v0 1005; GFX11-NEXT: s_setpc_b64 s[30:31] 1006entry: 1007 %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0 1008 %val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1009 ret <6 x bfloat> %val1 1010} 1011 1012define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) { 1013; GFX9-LABEL: shuffle_v8bf16_rebroadcast: 1014; GFX9: ; %bb.0: ; %entry 1015; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1016; GFX9-NEXT: global_load_dword v0, v[0:1], off 1017; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1018; GFX9-NEXT: s_waitcnt vmcnt(0) 1019; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1020; GFX9-NEXT: v_mov_b32_e32 v1, v0 1021; GFX9-NEXT: v_mov_b32_e32 v2, v0 1022; GFX9-NEXT: v_mov_b32_e32 v3, v0 1023; GFX9-NEXT: s_setpc_b64 s[30:31] 1024; 1025; GFX10-LABEL: shuffle_v8bf16_rebroadcast: 1026; GFX10: ; %bb.0: ; %entry 1027; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1028; GFX10-NEXT: global_load_dword v0, v[0:1], off 1029; GFX10-NEXT: s_waitcnt vmcnt(0) 1030; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1031; GFX10-NEXT: v_mov_b32_e32 v1, v0 1032; GFX10-NEXT: v_mov_b32_e32 v2, v0 1033; GFX10-NEXT: v_mov_b32_e32 v3, v0 1034; GFX10-NEXT: s_setpc_b64 s[30:31] 1035; 1036; GFX11-LABEL: shuffle_v8bf16_rebroadcast: 1037; GFX11: ; %bb.0: ; %entry 1038; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1039; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1040; GFX11-NEXT: s_waitcnt vmcnt(0) 1041; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1042; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1043; GFX11-NEXT: v_mov_b32_e32 v1, v0 1044; GFX11-NEXT: v_mov_b32_e32 v2, v0 1045; GFX11-NEXT: v_mov_b32_e32 v3, v0 1046; GFX11-NEXT: s_setpc_b64 s[30:31] 1047entry: 1048 %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 
1049 %val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1050 ret <8 x bfloat> %val1 1051} 1052 1053define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) { 1054; GFX9-LABEL: shuffle_v16bf16_rebroadcast: 1055; GFX9: ; %bb.0: ; %entry 1056; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1057; GFX9-NEXT: global_load_dword v0, v[0:1], off 1058; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1059; GFX9-NEXT: s_waitcnt vmcnt(0) 1060; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1061; GFX9-NEXT: v_mov_b32_e32 v1, v0 1062; GFX9-NEXT: v_mov_b32_e32 v2, v0 1063; GFX9-NEXT: v_mov_b32_e32 v3, v0 1064; GFX9-NEXT: v_mov_b32_e32 v4, v0 1065; GFX9-NEXT: v_mov_b32_e32 v5, v0 1066; GFX9-NEXT: v_mov_b32_e32 v6, v0 1067; GFX9-NEXT: v_mov_b32_e32 v7, v0 1068; GFX9-NEXT: s_setpc_b64 s[30:31] 1069; 1070; GFX10-LABEL: shuffle_v16bf16_rebroadcast: 1071; GFX10: ; %bb.0: ; %entry 1072; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1073; GFX10-NEXT: global_load_dword v0, v[0:1], off 1074; GFX10-NEXT: s_waitcnt vmcnt(0) 1075; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1076; GFX10-NEXT: v_mov_b32_e32 v1, v0 1077; GFX10-NEXT: v_mov_b32_e32 v2, v0 1078; GFX10-NEXT: v_mov_b32_e32 v3, v0 1079; GFX10-NEXT: v_mov_b32_e32 v4, v0 1080; GFX10-NEXT: v_mov_b32_e32 v5, v0 1081; GFX10-NEXT: v_mov_b32_e32 v6, v0 1082; GFX10-NEXT: v_mov_b32_e32 v7, v0 1083; GFX10-NEXT: s_setpc_b64 s[30:31] 1084; 1085; GFX11-LABEL: shuffle_v16bf16_rebroadcast: 1086; GFX11: ; %bb.0: ; %entry 1087; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1088; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1089; GFX11-NEXT: s_waitcnt vmcnt(0) 1090; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1091; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1092; GFX11-NEXT: v_mov_b32_e32 v1, v0 1093; GFX11-NEXT: v_mov_b32_e32 v2, v0 1094; GFX11-NEXT: v_mov_b32_e32 v3, v0 1095; GFX11-NEXT: v_mov_b32_e32 v4, v0 1096; GFX11-NEXT: v_mov_b32_e32 v5, v0 1097; 
GFX11-NEXT: v_mov_b32_e32 v6, v0 1098; GFX11-NEXT: v_mov_b32_e32 v7, v0 1099; GFX11-NEXT: s_setpc_b64 s[30:31] 1100entry: 1101 %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0 1102 %val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1103 ret <16 x bfloat> %val1 1104} 1105 1106define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) { 1107; GFX9-LABEL: shuffle_v32bf16_rebroadcast: 1108; GFX9: ; %bb.0: ; %entry 1109; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1110; GFX9-NEXT: global_load_dword v0, v[0:1], off 1111; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1112; GFX9-NEXT: s_waitcnt vmcnt(0) 1113; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1114; GFX9-NEXT: v_mov_b32_e32 v1, v0 1115; GFX9-NEXT: v_mov_b32_e32 v2, v0 1116; GFX9-NEXT: v_mov_b32_e32 v3, v0 1117; GFX9-NEXT: v_mov_b32_e32 v4, v0 1118; GFX9-NEXT: v_mov_b32_e32 v5, v0 1119; GFX9-NEXT: v_mov_b32_e32 v6, v0 1120; GFX9-NEXT: v_mov_b32_e32 v7, v0 1121; GFX9-NEXT: v_mov_b32_e32 v8, v0 1122; GFX9-NEXT: v_mov_b32_e32 v9, v0 1123; GFX9-NEXT: v_mov_b32_e32 v10, v0 1124; GFX9-NEXT: v_mov_b32_e32 v11, v0 1125; GFX9-NEXT: v_mov_b32_e32 v12, v0 1126; GFX9-NEXT: v_mov_b32_e32 v13, v0 1127; GFX9-NEXT: v_mov_b32_e32 v14, v0 1128; GFX9-NEXT: v_mov_b32_e32 v15, v0 1129; GFX9-NEXT: s_setpc_b64 s[30:31] 1130; 1131; GFX10-LABEL: shuffle_v32bf16_rebroadcast: 1132; GFX10: ; %bb.0: ; %entry 1133; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1134; GFX10-NEXT: global_load_dword v0, v[0:1], off 1135; GFX10-NEXT: s_waitcnt vmcnt(0) 1136; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1137; GFX10-NEXT: v_mov_b32_e32 v1, v0 1138; GFX10-NEXT: v_mov_b32_e32 v2, v0 1139; GFX10-NEXT: v_mov_b32_e32 v3, v0 1140; GFX10-NEXT: v_mov_b32_e32 v4, v0 1141; GFX10-NEXT: v_mov_b32_e32 v5, v0 1142; GFX10-NEXT: v_mov_b32_e32 v6, v0 1143; GFX10-NEXT: v_mov_b32_e32 v7, v0 1144; GFX10-NEXT: v_mov_b32_e32 
v8, v0 1145; GFX10-NEXT: v_mov_b32_e32 v9, v0 1146; GFX10-NEXT: v_mov_b32_e32 v10, v0 1147; GFX10-NEXT: v_mov_b32_e32 v11, v0 1148; GFX10-NEXT: v_mov_b32_e32 v12, v0 1149; GFX10-NEXT: v_mov_b32_e32 v13, v0 1150; GFX10-NEXT: v_mov_b32_e32 v14, v0 1151; GFX10-NEXT: v_mov_b32_e32 v15, v0 1152; GFX10-NEXT: s_setpc_b64 s[30:31] 1153; 1154; GFX11-LABEL: shuffle_v32bf16_rebroadcast: 1155; GFX11: ; %bb.0: ; %entry 1156; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1157; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1158; GFX11-NEXT: s_waitcnt vmcnt(0) 1159; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1160; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1161; GFX11-NEXT: v_mov_b32_e32 v1, v0 1162; GFX11-NEXT: v_mov_b32_e32 v2, v0 1163; GFX11-NEXT: v_mov_b32_e32 v3, v0 1164; GFX11-NEXT: v_mov_b32_e32 v4, v0 1165; GFX11-NEXT: v_mov_b32_e32 v5, v0 1166; GFX11-NEXT: v_mov_b32_e32 v6, v0 1167; GFX11-NEXT: v_mov_b32_e32 v7, v0 1168; GFX11-NEXT: v_mov_b32_e32 v8, v0 1169; GFX11-NEXT: v_mov_b32_e32 v9, v0 1170; GFX11-NEXT: v_mov_b32_e32 v10, v0 1171; GFX11-NEXT: v_mov_b32_e32 v11, v0 1172; GFX11-NEXT: v_mov_b32_e32 v12, v0 1173; GFX11-NEXT: v_mov_b32_e32 v13, v0 1174; GFX11-NEXT: v_mov_b32_e32 v14, v0 1175; GFX11-NEXT: v_mov_b32_e32 v15, v0 1176; GFX11-NEXT: s_setpc_b64 s[30:31] 1177entry: 1178 %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0 1179 %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1180 ret <32 x bfloat> %val1 1181} 1182 1183define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) { 1184; GFX9-LABEL: shuffle_v2f16_rebroadcast: 1185; GFX9: ; %bb.0: ; %entry 1186; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1187; GFX9-NEXT: global_load_dword v0, v[0:1], off 1188; GFX9-NEXT: s_mov_b32 s4, 0x7060302 
1189; GFX9-NEXT: s_waitcnt vmcnt(0) 1190; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1191; GFX9-NEXT: s_setpc_b64 s[30:31] 1192; 1193; GFX10-LABEL: shuffle_v2f16_rebroadcast: 1194; GFX10: ; %bb.0: ; %entry 1195; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1196; GFX10-NEXT: global_load_dword v0, v[0:1], off 1197; GFX10-NEXT: s_waitcnt vmcnt(0) 1198; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1199; GFX10-NEXT: s_setpc_b64 s[30:31] 1200; 1201; GFX11-LABEL: shuffle_v2f16_rebroadcast: 1202; GFX11: ; %bb.0: ; %entry 1203; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1204; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1205; GFX11-NEXT: s_waitcnt vmcnt(0) 1206; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1207; GFX11-NEXT: s_setpc_b64 s[30:31] 1208entry: 1209 %val0 = load <2 x half>, ptr addrspace(1) %arg0 1210 %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> <i32 1, i32 1> 1211 ret <2 x half> %val1 1212} 1213 1214define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) { 1215; GFX9-LABEL: shuffle_v3f16_rebroadcast: 1216; GFX9: ; %bb.0: ; %entry 1217; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1218; GFX9-NEXT: global_load_dword v1, v[0:1], off 1219; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1220; GFX9-NEXT: s_waitcnt vmcnt(0) 1221; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 1222; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 1223; GFX9-NEXT: s_setpc_b64 s[30:31] 1224; 1225; GFX10-LABEL: shuffle_v3f16_rebroadcast: 1226; GFX10: ; %bb.0: ; %entry 1227; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1228; GFX10-NEXT: global_load_dword v1, v[0:1], off 1229; GFX10-NEXT: s_waitcnt vmcnt(0) 1230; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 1231; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 1232; GFX10-NEXT: s_setpc_b64 s[30:31] 1233; 1234; GFX11-LABEL: shuffle_v3f16_rebroadcast: 1235; GFX11: ; %bb.0: ; %entry 1236; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1237; GFX11-NEXT: global_load_b32 v1, v[0:1], off 1238; GFX11-NEXT: s_waitcnt 
; shuffle_v4f16_rebroadcast (defined below) splats element 1 of a loaded
; <4 x half> through the shufflevector mask <1, 1, 1, 1>. For all three
; targets the expected code loads a single dword into v0, rewrites v0 with
; v_perm_b32 (constant 0x7060302) and copies v0 into v1. The gfx1100 run
; additionally expects an s_delay_alu between the v_perm_b32 and the copy.
vmcnt(0) 1239; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 1240; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16 1241; GFX11-NEXT: s_setpc_b64 s[30:31] 1242entry: 1243 %val0 = load <3 x half>, ptr addrspace(1) %arg0 1244 %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> <i32 1, i32 1, i32 1> 1245 ret <3 x half> %val1 1246} 1247 1248define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) { 1249; GFX9-LABEL: shuffle_v4f16_rebroadcast: 1250; GFX9: ; %bb.0: ; %entry 1251; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1252; GFX9-NEXT: global_load_dword v0, v[0:1], off 1253; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1254; GFX9-NEXT: s_waitcnt vmcnt(0) 1255; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1256; GFX9-NEXT: v_mov_b32_e32 v1, v0 1257; GFX9-NEXT: s_setpc_b64 s[30:31] 1258; 1259; GFX10-LABEL: shuffle_v4f16_rebroadcast: 1260; GFX10: ; %bb.0: ; %entry 1261; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1262; GFX10-NEXT: global_load_dword v0, v[0:1], off 1263; GFX10-NEXT: s_waitcnt vmcnt(0) 1264; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1265; GFX10-NEXT: v_mov_b32_e32 v1, v0 1266; GFX10-NEXT: s_setpc_b64 s[30:31] 1267; 1268; GFX11-LABEL: shuffle_v4f16_rebroadcast: 1269; GFX11: ; %bb.0: ; %entry 1270; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1271; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1272; GFX11-NEXT: s_waitcnt vmcnt(0) 1273; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1274; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1275; GFX11-NEXT: v_mov_b32_e32 v1, v0 1276; GFX11-NEXT: s_setpc_b64 s[30:31] 1277entry: 1278 %val0 = load <4 x half>, ptr addrspace(1) %arg0 1279 %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1280 ret <4 x half> %val1 1281} 1282 1283define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) { 1284; GFX9-LABEL: shuffle_v6f16_rebroadcast: 1285; GFX9: ; %bb.0: ; %entry 1286; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1287; GFX9-NEXT:
global_load_dword v0, v[0:1], off 1288; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1289; GFX9-NEXT: s_waitcnt vmcnt(0) 1290; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1291; GFX9-NEXT: v_mov_b32_e32 v1, v0 1292; GFX9-NEXT: v_mov_b32_e32 v2, v0 1293; GFX9-NEXT: s_setpc_b64 s[30:31] 1294; 1295; GFX10-LABEL: shuffle_v6f16_rebroadcast: 1296; GFX10: ; %bb.0: ; %entry 1297; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1298; GFX10-NEXT: global_load_dword v0, v[0:1], off 1299; GFX10-NEXT: s_waitcnt vmcnt(0) 1300; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1301; GFX10-NEXT: v_mov_b32_e32 v1, v0 1302; GFX10-NEXT: v_mov_b32_e32 v2, v0 1303; GFX10-NEXT: s_setpc_b64 s[30:31] 1304; 1305; GFX11-LABEL: shuffle_v6f16_rebroadcast: 1306; GFX11: ; %bb.0: ; %entry 1307; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1308; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1309; GFX11-NEXT: s_waitcnt vmcnt(0) 1310; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1311; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1312; GFX11-NEXT: v_mov_b32_e32 v1, v0 1313; GFX11-NEXT: v_mov_b32_e32 v2, v0 1314; GFX11-NEXT: s_setpc_b64 s[30:31] 1315entry: 1316 %val0 = load <6 x half>, ptr addrspace(1) %arg0 1317 %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1318 ret <6 x half> %val1 1319} 1320 1321define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) { 1322; GFX9-LABEL: shuffle_v8f16_rebroadcast: 1323; GFX9: ; %bb.0: ; %entry 1324; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1325; GFX9-NEXT: global_load_dword v0, v[0:1], off 1326; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1327; GFX9-NEXT: s_waitcnt vmcnt(0) 1328; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1329; GFX9-NEXT: v_mov_b32_e32 v1, v0 1330; GFX9-NEXT: v_mov_b32_e32 v2, v0 1331; GFX9-NEXT: v_mov_b32_e32 v3, v0 1332; GFX9-NEXT: s_setpc_b64 s[30:31] 1333; 1334; GFX10-LABEL: shuffle_v8f16_rebroadcast: 1335; GFX10: ; %bb.0: ; %entry 1336; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 1337; GFX10-NEXT: global_load_dword v0, v[0:1], off 1338; GFX10-NEXT: s_waitcnt vmcnt(0) 1339; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1340; GFX10-NEXT: v_mov_b32_e32 v1, v0 1341; GFX10-NEXT: v_mov_b32_e32 v2, v0 1342; GFX10-NEXT: v_mov_b32_e32 v3, v0 1343; GFX10-NEXT: s_setpc_b64 s[30:31] 1344; 1345; GFX11-LABEL: shuffle_v8f16_rebroadcast: 1346; GFX11: ; %bb.0: ; %entry 1347; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1348; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1349; GFX11-NEXT: s_waitcnt vmcnt(0) 1350; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1351; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1352; GFX11-NEXT: v_mov_b32_e32 v1, v0 1353; GFX11-NEXT: v_mov_b32_e32 v2, v0 1354; GFX11-NEXT: v_mov_b32_e32 v3, v0 1355; GFX11-NEXT: s_setpc_b64 s[30:31] 1356entry: 1357 %val0 = load <8 x half>, ptr addrspace(1) %arg0 1358 %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1359 ret <8 x half> %val1 1360} 1361 1362define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) { 1363; GFX9-LABEL: shuffle_v16f16_rebroadcast: 1364; GFX9: ; %bb.0: ; %entry 1365; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1366; GFX9-NEXT: global_load_dword v0, v[0:1], off 1367; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1368; GFX9-NEXT: s_waitcnt vmcnt(0) 1369; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1370; GFX9-NEXT: v_mov_b32_e32 v1, v0 1371; GFX9-NEXT: v_mov_b32_e32 v2, v0 1372; GFX9-NEXT: v_mov_b32_e32 v3, v0 1373; GFX9-NEXT: v_mov_b32_e32 v4, v0 1374; GFX9-NEXT: v_mov_b32_e32 v5, v0 1375; GFX9-NEXT: v_mov_b32_e32 v6, v0 1376; GFX9-NEXT: v_mov_b32_e32 v7, v0 1377; GFX9-NEXT: s_setpc_b64 s[30:31] 1378; 1379; GFX10-LABEL: shuffle_v16f16_rebroadcast: 1380; GFX10: ; %bb.0: ; %entry 1381; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1382; GFX10-NEXT: global_load_dword v0, v[0:1], off 1383; GFX10-NEXT: s_waitcnt vmcnt(0) 1384; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1385; 
GFX10-NEXT: v_mov_b32_e32 v1, v0 1386; GFX10-NEXT: v_mov_b32_e32 v2, v0 1387; GFX10-NEXT: v_mov_b32_e32 v3, v0 1388; GFX10-NEXT: v_mov_b32_e32 v4, v0 1389; GFX10-NEXT: v_mov_b32_e32 v5, v0 1390; GFX10-NEXT: v_mov_b32_e32 v6, v0 1391; GFX10-NEXT: v_mov_b32_e32 v7, v0 1392; GFX10-NEXT: s_setpc_b64 s[30:31] 1393; 1394; GFX11-LABEL: shuffle_v16f16_rebroadcast: 1395; GFX11: ; %bb.0: ; %entry 1396; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1397; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1398; GFX11-NEXT: s_waitcnt vmcnt(0) 1399; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1400; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1401; GFX11-NEXT: v_mov_b32_e32 v1, v0 1402; GFX11-NEXT: v_mov_b32_e32 v2, v0 1403; GFX11-NEXT: v_mov_b32_e32 v3, v0 1404; GFX11-NEXT: v_mov_b32_e32 v4, v0 1405; GFX11-NEXT: v_mov_b32_e32 v5, v0 1406; GFX11-NEXT: v_mov_b32_e32 v6, v0 1407; GFX11-NEXT: v_mov_b32_e32 v7, v0 1408; GFX11-NEXT: s_setpc_b64 s[30:31] 1409entry: 1410 %val0 = load <16 x half>, ptr addrspace(1) %arg0 1411 %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1412 ret <16 x half> %val1 1413} 1414 1415define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) { 1416; GFX9-LABEL: shuffle_v32f16_rebroadcast: 1417; GFX9: ; %bb.0: ; %entry 1418; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1419; GFX9-NEXT: global_load_dword v0, v[0:1], off 1420; GFX9-NEXT: s_mov_b32 s4, 0x7060302 1421; GFX9-NEXT: s_waitcnt vmcnt(0) 1422; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 1423; GFX9-NEXT: v_mov_b32_e32 v1, v0 1424; GFX9-NEXT: v_mov_b32_e32 v2, v0 1425; GFX9-NEXT: v_mov_b32_e32 v3, v0 1426; GFX9-NEXT: v_mov_b32_e32 v4, v0 1427; GFX9-NEXT: v_mov_b32_e32 v5, v0 1428; GFX9-NEXT: v_mov_b32_e32 v6, v0 1429; GFX9-NEXT: v_mov_b32_e32 v7, v0 1430; GFX9-NEXT: v_mov_b32_e32 v8, v0 1431; GFX9-NEXT: v_mov_b32_e32 v9, v0 1432; GFX9-NEXT: v_mov_b32_e32 v10, 
v0 1433; GFX9-NEXT: v_mov_b32_e32 v11, v0 1434; GFX9-NEXT: v_mov_b32_e32 v12, v0 1435; GFX9-NEXT: v_mov_b32_e32 v13, v0 1436; GFX9-NEXT: v_mov_b32_e32 v14, v0 1437; GFX9-NEXT: v_mov_b32_e32 v15, v0 1438; GFX9-NEXT: s_setpc_b64 s[30:31] 1439; 1440; GFX10-LABEL: shuffle_v32f16_rebroadcast: 1441; GFX10: ; %bb.0: ; %entry 1442; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1443; GFX10-NEXT: global_load_dword v0, v[0:1], off 1444; GFX10-NEXT: s_waitcnt vmcnt(0) 1445; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1446; GFX10-NEXT: v_mov_b32_e32 v1, v0 1447; GFX10-NEXT: v_mov_b32_e32 v2, v0 1448; GFX10-NEXT: v_mov_b32_e32 v3, v0 1449; GFX10-NEXT: v_mov_b32_e32 v4, v0 1450; GFX10-NEXT: v_mov_b32_e32 v5, v0 1451; GFX10-NEXT: v_mov_b32_e32 v6, v0 1452; GFX10-NEXT: v_mov_b32_e32 v7, v0 1453; GFX10-NEXT: v_mov_b32_e32 v8, v0 1454; GFX10-NEXT: v_mov_b32_e32 v9, v0 1455; GFX10-NEXT: v_mov_b32_e32 v10, v0 1456; GFX10-NEXT: v_mov_b32_e32 v11, v0 1457; GFX10-NEXT: v_mov_b32_e32 v12, v0 1458; GFX10-NEXT: v_mov_b32_e32 v13, v0 1459; GFX10-NEXT: v_mov_b32_e32 v14, v0 1460; GFX10-NEXT: v_mov_b32_e32 v15, v0 1461; GFX10-NEXT: s_setpc_b64 s[30:31] 1462; 1463; GFX11-LABEL: shuffle_v32f16_rebroadcast: 1464; GFX11: ; %bb.0: ; %entry 1465; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1466; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1467; GFX11-NEXT: s_waitcnt vmcnt(0) 1468; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 1469; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1470; GFX11-NEXT: v_mov_b32_e32 v1, v0 1471; GFX11-NEXT: v_mov_b32_e32 v2, v0 1472; GFX11-NEXT: v_mov_b32_e32 v3, v0 1473; GFX11-NEXT: v_mov_b32_e32 v4, v0 1474; GFX11-NEXT: v_mov_b32_e32 v5, v0 1475; GFX11-NEXT: v_mov_b32_e32 v6, v0 1476; GFX11-NEXT: v_mov_b32_e32 v7, v0 1477; GFX11-NEXT: v_mov_b32_e32 v8, v0 1478; GFX11-NEXT: v_mov_b32_e32 v9, v0 1479; GFX11-NEXT: v_mov_b32_e32 v10, v0 1480; GFX11-NEXT: v_mov_b32_e32 v11, v0 1481; GFX11-NEXT: v_mov_b32_e32 v12, v0 1482; GFX11-NEXT: v_mov_b32_e32 v13, v0 1483; 
; shuffle_v2f32_rebroadcast (defined below) splats element 1 of a loaded
; <2 x float> through the shufflevector mask <1, 1>. The expected code loads
; into the register pair v[0:1] (global_load_dwordx2 in the gfx900 and gfx1010
; runs, global_load_b64 in the gfx1100 run) and then copies v1 into v0; v1 is
; returned as loaded.
GFX11-NEXT: v_mov_b32_e32 v14, v0 1484; GFX11-NEXT: v_mov_b32_e32 v15, v0 1485; GFX11-NEXT: s_setpc_b64 s[30:31] 1486entry: 1487 %val0 = load <32 x half>, ptr addrspace(1) %arg0 1488 %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1489 ret <32 x half> %val1 1490} 1491 1492define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) { 1493; GFX9-LABEL: shuffle_v2f32_rebroadcast: 1494; GFX9: ; %bb.0: ; %entry 1495; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1496; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1497; GFX9-NEXT: s_waitcnt vmcnt(0) 1498; GFX9-NEXT: v_mov_b32_e32 v0, v1 1499; GFX9-NEXT: s_setpc_b64 s[30:31] 1500; 1501; GFX10-LABEL: shuffle_v2f32_rebroadcast: 1502; GFX10: ; %bb.0: ; %entry 1503; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1504; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1505; GFX10-NEXT: s_waitcnt vmcnt(0) 1506; GFX10-NEXT: v_mov_b32_e32 v0, v1 1507; GFX10-NEXT: s_setpc_b64 s[30:31] 1508; 1509; GFX11-LABEL: shuffle_v2f32_rebroadcast: 1510; GFX11: ; %bb.0: ; %entry 1511; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1512; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off 1513; GFX11-NEXT: s_waitcnt vmcnt(0) 1514; GFX11-NEXT: v_mov_b32_e32 v0, v1 1515; GFX11-NEXT: s_setpc_b64 s[30:31] 1516entry: 1517 %val0 = load <2 x float>, ptr addrspace(1) %arg0 1518 %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> <i32 1, i32 1> 1519 ret <2 x float> %val1 1520} 1521 1522define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) { 1523; GFX9-LABEL: shuffle_v3f32_rebroadcast: 1524; GFX9: ; %bb.0: ; %entry 1525; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1526; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off 1527; GFX9-NEXT:
s_waitcnt vmcnt(0) 1528; GFX9-NEXT: v_mov_b32_e32 v0, v1 1529; GFX9-NEXT: v_mov_b32_e32 v2, v1 1530; GFX9-NEXT: s_setpc_b64 s[30:31] 1531; 1532; GFX10-LABEL: shuffle_v3f32_rebroadcast: 1533; GFX10: ; %bb.0: ; %entry 1534; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1535; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off 1536; GFX10-NEXT: s_waitcnt vmcnt(0) 1537; GFX10-NEXT: v_mov_b32_e32 v0, v1 1538; GFX10-NEXT: v_mov_b32_e32 v2, v1 1539; GFX10-NEXT: s_setpc_b64 s[30:31] 1540; 1541; GFX11-LABEL: shuffle_v3f32_rebroadcast: 1542; GFX11: ; %bb.0: ; %entry 1543; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1544; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off 1545; GFX11-NEXT: s_waitcnt vmcnt(0) 1546; GFX11-NEXT: v_mov_b32_e32 v0, v1 1547; GFX11-NEXT: v_mov_b32_e32 v2, v1 1548; GFX11-NEXT: s_setpc_b64 s[30:31] 1549entry: 1550 %val0 = load <3 x float>, ptr addrspace(1) %arg0 1551 %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1> 1552 ret <3 x float> %val1 1553} 1554 1555define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) { 1556; GFX9-LABEL: shuffle_v4f32_rebroadcast: 1557; GFX9: ; %bb.0: ; %entry 1558; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1559; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 1560; GFX9-NEXT: s_waitcnt vmcnt(0) 1561; GFX9-NEXT: v_mov_b32_e32 v0, v1 1562; GFX9-NEXT: v_mov_b32_e32 v2, v1 1563; GFX9-NEXT: v_mov_b32_e32 v3, v1 1564; GFX9-NEXT: s_setpc_b64 s[30:31] 1565; 1566; GFX10-LABEL: shuffle_v4f32_rebroadcast: 1567; GFX10: ; %bb.0: ; %entry 1568; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1569; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off 1570; GFX10-NEXT: s_waitcnt vmcnt(0) 1571; GFX10-NEXT: v_mov_b32_e32 v0, v1 1572; GFX10-NEXT: v_mov_b32_e32 v2, v1 1573; GFX10-NEXT: v_mov_b32_e32 v3, v1 1574; GFX10-NEXT: s_setpc_b64 s[30:31] 1575; 1576; GFX11-LABEL: shuffle_v4f32_rebroadcast: 1577; GFX11: ; %bb.0: ; %entry 1578; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 1579; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off 1580; GFX11-NEXT: s_waitcnt vmcnt(0) 1581; GFX11-NEXT: v_mov_b32_e32 v0, v1 1582; GFX11-NEXT: v_mov_b32_e32 v2, v1 1583; GFX11-NEXT: v_mov_b32_e32 v3, v1 1584; GFX11-NEXT: s_setpc_b64 s[30:31] 1585entry: 1586 %val0 = load <4 x float>, ptr addrspace(1) %arg0 1587 %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1588 ret <4 x float> %val1 1589} 1590

; Loads a <6 x float> through the addrspace(1) pointer %arg0 and returns a
; vector whose six lanes all hold element 1 of the loaded value (shuffle mask
; of all ones). The expected code for every tested subtarget is one 128-bit
; global load into v[0:3] followed by copies of v1 into v0 and v2 through v5.
define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v6f32_rebroadcast:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6f32_rebroadcast:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v2, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v6f32_rebroadcast:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    v_mov_b32_e32 v2, v1
; GFX11-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-NEXT:    v_mov_b32_e32 v4, v1
; GFX11-NEXT:    v_mov_b32_e32 v5, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %val0 = load <6 x float>, ptr addrspace(1) %arg0
  %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <6 x float> %val1
}

; Same rebroadcast pattern for <8 x float>: every result lane is element 1 of
; the loaded vector. The expected code still performs a single 128-bit global
; load into v[0:3] and then copies v1 into v0 and v2 through v7.
define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v8f32_rebroadcast:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v1
; GFX9-NEXT:    v_mov_b32_e32 v7, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8f32_rebroadcast:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v2, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v1
; GFX10-NEXT:    v_mov_b32_e32 v7, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8f32_rebroadcast:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    v_mov_b32_e32 v2, v1
; GFX11-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-NEXT:    v_mov_b32_e32 v4, v1
; GFX11-NEXT:    v_mov_b32_e32 v5, v1
; GFX11-NEXT:    v_mov_b32_e32 v6, v1
; GFX11-NEXT:    v_mov_b32_e32 v7, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %val0 = load <8 x float>, ptr addrspace(1) %arg0
  %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x float> %val1
}

; Same rebroadcast pattern for <16 x float>: every result lane is element 1 of
; the loaded vector. The expected code performs a single 128-bit global load
; into v[0:3] and then copies v1 into v0 and v2 through v15.
define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v16f32_rebroadcast:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v1
; GFX9-NEXT:    v_mov_b32_e32 v7, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v1
; GFX9-NEXT:    v_mov_b32_e32 v9, v1
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_mov_b32_e32 v11, v1
; GFX9-NEXT:    v_mov_b32_e32 v12, v1
; GFX9-NEXT:    v_mov_b32_e32 v13, v1
; GFX9-NEXT:    v_mov_b32_e32 v14, v1
; GFX9-NEXT:    v_mov_b32_e32 v15, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v16f32_rebroadcast:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v2, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v1
; GFX10-NEXT:    v_mov_b32_e32 v7, v1
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_mov_b32_e32 v9, v1
; GFX10-NEXT:    v_mov_b32_e32 v10, v1
; GFX10-NEXT:    v_mov_b32_e32 v11, v1
; GFX10-NEXT:    v_mov_b32_e32 v12, v1
; GFX10-NEXT:    v_mov_b32_e32 v13, v1
; GFX10-NEXT:    v_mov_b32_e32 v14, v1
; GFX10-NEXT:    v_mov_b32_e32 v15, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v16f32_rebroadcast:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    v_mov_b32_e32 v2, v1
; GFX11-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-NEXT:    v_mov_b32_e32 v4, v1
; GFX11-NEXT:    v_mov_b32_e32 v5, v1
; GFX11-NEXT:    v_mov_b32_e32 v6, v1
; GFX11-NEXT:    v_mov_b32_e32 v7, v1
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_mov_b32_e32 v9, v1
; GFX11-NEXT:    v_mov_b32_e32 v10, v1
; GFX11-NEXT:    v_mov_b32_e32 v11, v1
; GFX11-NEXT:    v_mov_b32_e32 v12, v1
; GFX11-NEXT:    v_mov_b32_e32 v13, v1
; GFX11-NEXT:    v_mov_b32_e32 v14, v1
; GFX11-NEXT:    v_mov_b32_e32 v15, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %val0 = load <16 x float>, ptr addrspace(1) %arg0
  %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x float> %val1
}

; Same rebroadcast pattern for <32 x float>: every result lane is element 1 of
; the loaded vector. The expected code performs a single 128-bit global load
; into v[0:3] and then copies v1 into v0 and v2 through v31.
define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v32f32_rebroadcast:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v1
; GFX9-NEXT:    v_mov_b32_e32 v7, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v1
; GFX9-NEXT:    v_mov_b32_e32 v9, v1
; GFX9-NEXT:    v_mov_b32_e32 v10, v1
; GFX9-NEXT:    v_mov_b32_e32 v11, v1
; GFX9-NEXT:    v_mov_b32_e32 v12, v1
; GFX9-NEXT:    v_mov_b32_e32 v13, v1
; GFX9-NEXT:    v_mov_b32_e32 v14, v1
; GFX9-NEXT:    v_mov_b32_e32 v15, v1
; GFX9-NEXT:    v_mov_b32_e32 v16, v1
; GFX9-NEXT:    v_mov_b32_e32 v17, v1
; GFX9-NEXT:    v_mov_b32_e32 v18, v1
; GFX9-NEXT:    v_mov_b32_e32 v19, v1
; GFX9-NEXT:    v_mov_b32_e32 v20, v1
; GFX9-NEXT:    v_mov_b32_e32 v21, v1
; GFX9-NEXT:    v_mov_b32_e32 v22, v1
; GFX9-NEXT:    v_mov_b32_e32 v23, v1
; GFX9-NEXT:    v_mov_b32_e32 v24, v1
; GFX9-NEXT:    v_mov_b32_e32 v25, v1
; GFX9-NEXT:    v_mov_b32_e32 v26, v1
; GFX9-NEXT:    v_mov_b32_e32 v27, v1
; GFX9-NEXT:    v_mov_b32_e32 v28, v1
; GFX9-NEXT:    v_mov_b32_e32 v29, v1
; GFX9-NEXT:    v_mov_b32_e32 v30, v1
; GFX9-NEXT:    v_mov_b32_e32 v31, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v32f32_rebroadcast:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v2, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mov_b32_e32 v6, v1
; GFX10-NEXT:    v_mov_b32_e32 v7, v1
; GFX10-NEXT:    v_mov_b32_e32 v8, v1
; GFX10-NEXT:    v_mov_b32_e32 v9, v1
; GFX10-NEXT:    v_mov_b32_e32 v10, v1
; GFX10-NEXT:    v_mov_b32_e32 v11, v1
; GFX10-NEXT:    v_mov_b32_e32 v12, v1
; GFX10-NEXT:    v_mov_b32_e32 v13, v1
; GFX10-NEXT:    v_mov_b32_e32 v14, v1
; GFX10-NEXT:    v_mov_b32_e32 v15, v1
; GFX10-NEXT:    v_mov_b32_e32 v16, v1
; GFX10-NEXT:    v_mov_b32_e32 v17, v1
; GFX10-NEXT:    v_mov_b32_e32 v18, v1
; GFX10-NEXT:    v_mov_b32_e32 v19, v1
; GFX10-NEXT:    v_mov_b32_e32 v20, v1
; GFX10-NEXT:    v_mov_b32_e32 v21, v1
; GFX10-NEXT:    v_mov_b32_e32 v22, v1
; GFX10-NEXT:    v_mov_b32_e32 v23, v1
; GFX10-NEXT:    v_mov_b32_e32 v24, v1
; GFX10-NEXT:    v_mov_b32_e32 v25, v1
; GFX10-NEXT:    v_mov_b32_e32 v26, v1
; GFX10-NEXT:    v_mov_b32_e32 v27, v1
; GFX10-NEXT:    v_mov_b32_e32 v28, v1
; GFX10-NEXT:    v_mov_b32_e32 v29, v1
; GFX10-NEXT:    v_mov_b32_e32 v30, v1
; GFX10-NEXT:    v_mov_b32_e32 v31, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v32f32_rebroadcast:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    v_mov_b32_e32 v2, v1
; GFX11-NEXT:    v_mov_b32_e32 v3, v1
; GFX11-NEXT:    v_mov_b32_e32 v4, v1
; GFX11-NEXT:    v_mov_b32_e32 v5, v1
; GFX11-NEXT:    v_mov_b32_e32 v6, v1
; GFX11-NEXT:    v_mov_b32_e32 v7, v1
; GFX11-NEXT:    v_mov_b32_e32 v8, v1
; GFX11-NEXT:    v_mov_b32_e32 v9, v1
; GFX11-NEXT:    v_mov_b32_e32 v10, v1
; GFX11-NEXT:    v_mov_b32_e32 v11, v1
; GFX11-NEXT:    v_mov_b32_e32 v12, v1
; GFX11-NEXT:    v_mov_b32_e32 v13, v1
; GFX11-NEXT:    v_mov_b32_e32 v14, v1
; GFX11-NEXT:    v_mov_b32_e32 v15, v1
; GFX11-NEXT:    v_mov_b32_e32 v16, v1
; GFX11-NEXT:    v_mov_b32_e32 v17, v1
; GFX11-NEXT:    v_mov_b32_e32 v18, v1
; GFX11-NEXT:    v_mov_b32_e32 v19, v1
; GFX11-NEXT:    v_mov_b32_e32 v20, v1
; GFX11-NEXT:    v_mov_b32_e32 v21, v1
; GFX11-NEXT:    v_mov_b32_e32 v22, v1
; GFX11-NEXT:    v_mov_b32_e32 v23, v1
; GFX11-NEXT:    v_mov_b32_e32 v24, v1
; GFX11-NEXT:    v_mov_b32_e32 v25, v1
; GFX11-NEXT:    v_mov_b32_e32 v26, v1
; GFX11-NEXT:    v_mov_b32_e32 v27, v1
; GFX11-NEXT:    v_mov_b32_e32 v28, v1
; GFX11-NEXT:    v_mov_b32_e32 v29, v1
; GFX11-NEXT:    v_mov_b32_e32 v30, v1
; GFX11-NEXT:    v_mov_b32_e32 v31, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %val0 = load <32 x float>, ptr addrspace(1) %arg0
  %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <32 x float> %val1
}