; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s

; Each function below loads a volatile vector of 16-bit elements on both
; sides of a branch, merges the two loads with a phi, extracts four lanes
; with shufflevector and feeds them to a vector compare + select.
;
; The second shufflevector operand is never selected by any mask index, so it
; is written as poison rather than undef.
;
; FIXME: every function branches on undef, which the LangRef defines as
; undefined behaviour for br. Replacing the condition with a real i1 argument
; changes the generated code, so it has to be done together with a rerun of
; utils/update_llc_test_checks.py.

; Extracts lanes <0, 1, 2, 2> of an <8 x i16>; lanes greater than -1 select
; -32768, the others select -1.
define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; SI-LABEL: vec_8xi16_extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB0_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT:    v_or_b32_e32 v3, v6, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v5
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB0_3
; SI-NEXT:    s_branch .LBB0_4
; SI-NEXT:  .LBB0_2:
; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB0_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v3, v4, v0
; SI-NEXT:    v_or_b32_e32 v2, v2, v1
; SI-NEXT:  .LBB0_4: ; %exit
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v4, 1
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v4
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
; SI-NEXT:    v_or_b32_e32 v2, v3, v2
; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB0_3
; GFX9-NEXT:    s_branch .LBB0_4
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT:  .LBB0_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB0_4: ; %exit
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: vec_8xi16_extract_4xi16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX11-NEXT:  ; %bb.1: ; %F
; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_cbranch_execz .LBB0_3
; GFX11-NEXT:    s_branch .LBB0_4
; GFX11-NEXT:  .LBB0_2:
; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX11-NEXT:  .LBB0_3: ; %T
; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:  .LBB0_4: ; %exit
; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i16> %m, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; Same as above, but extracts lanes <4, 5, 6, 7> (the upper half) of the
; <8 x i16>.
define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; SI-LABEL: vec_8xi16_extract_4xi16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB1_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v5, v6, v2
; SI-NEXT:    v_or_b32_e32 v4, v4, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB1_3
; SI-NEXT:    s_branch .LBB1_4
; SI-NEXT:  .LBB1_2:
; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB1_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v5, v4, v0
; SI-NEXT:    v_or_b32_e32 v4, v2, v1
; SI-NEXT:  .LBB1_4: ; %exit
; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v5, 1
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff
; SI-NEXT:    v_mov_b32_e32 v7, 0x8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
; SI-NEXT:    v_or_b32_e32 v0, v1, v8
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v8, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB1_3
; GFX9-NEXT:    s_branch .LBB1_4
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT:  .LBB1_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB1_4: ; %exit
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX11-NEXT:  ; %bb.1: ; %F
; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_cbranch_execz .LBB1_3
; GFX11-NEXT:    s_branch .LBB1_4
; GFX11-NEXT:  .LBB1_2:
; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX11-NEXT:  .LBB1_3: ; %T
; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:  .LBB1_4: ; %exit
; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i16> %m, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

; Half-precision variant: extracts lanes <0, 1, 2, 2> of an <8 x half>,
; compares them with fcmp ugt against 0xH3800 and selects 0xH3900 when the
; compare is true, 0xH3D00 otherwise.
define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; SI-LABEL: vec_8xf16_extract_4xf16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB2_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v4, v4, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB2_3
; SI-NEXT:    s_branch .LBB2_4
; SI-NEXT:  .LBB2_2:
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB2_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:  .LBB2_4: ; %exit
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xf16_extract_4xf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB2_3
; GFX9-NEXT:    s_branch .LBB2_4
; GFX9-NEXT:  .LBB2_2:
; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT:  .LBB2_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB2_4: ; %exit
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, 0.5, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v5
; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: vec_8xf16_extract_4xf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_cbranch_scc0 .LBB2_2
; GFX11-NEXT:  ; %bb.1: ; %F
; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_cbranch_execz .LBB2_3
; GFX11-NEXT:    s_branch .LBB2_4
; GFX11-NEXT:  .LBB2_2:
; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX11-NEXT:  .LBB2_3: ; %T
; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:  .LBB2_4: ; %exit
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <8 x half>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x half>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x half> %m, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
  ret <4 x half> %r2
}

; 16-element variant: extracts lanes <0, 1, 2, 2> of a <16 x i16>; lanes
; greater than -1 select -32768, the others select -1.
define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
;
; SI-LABEL: vec_16xi16_extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB3_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT:    v_or_b32_e32 v3, v6, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v5
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB3_3
; SI-NEXT:    s_branch .LBB3_4
; SI-NEXT:  .LBB3_2:
; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB3_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v3, v4, v0
; SI-NEXT:    v_or_b32_e32 v2, v2, v1
; SI-NEXT:  .LBB3_4: ; %exit
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v4, 1
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff
; SI-NEXT:    v_mov_b32_e32 v6, 0x8000
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v1, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v4
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
; SI-NEXT:    v_or_b32_e32 v2, v3, v2
; SI-NEXT:    v_alignbit_b32 v1, v2, v4, 16
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB3_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT:    s_cbranch_execz .LBB3_3
; GFX9-NEXT:    s_branch .LBB3_4
; GFX9-NEXT:  .LBB3_2:
; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT:  .LBB3_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT:  .LBB3_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: vec_16xi16_extract_4xi16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_cbranch_scc0 .LBB3_2
; GFX11-NEXT:  ; %bb.1: ; %F
; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_cbranch_execz .LBB3_3
; GFX11-NEXT:    s_branch .LBB3_4
; GFX11-NEXT:  .LBB3_2:
; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-NEXT:  .LBB3_3: ; %T
; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:  .LBB3_4: ; %exit
; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <16 x i16> %m, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}

687define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) { 688; 689; SI-LABEL: vec_16xi16_extract_4xi16_2: 690; SI: ; %bb.0: 691; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 692; SI-NEXT: s_cbranch_scc0 .LBB4_2 693; SI-NEXT: ; %bb.1: ; %F 694; SI-NEXT: s_mov_b32 s6, 0 695; SI-NEXT: s_mov_b32 s7, 0xf000 696; SI-NEXT: s_mov_b32 s4, s6 697; SI-NEXT: s_mov_b32 s5, s6 698; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 699; SI-NEXT: s_waitcnt vmcnt(0) 700; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc 701; SI-NEXT: s_waitcnt vmcnt(0) 702; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc 703; SI-NEXT: s_waitcnt vmcnt(0) 704; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc 705; SI-NEXT: s_waitcnt vmcnt(0) 706; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc 707; SI-NEXT: s_waitcnt vmcnt(0) 708; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc 709; SI-NEXT: s_waitcnt vmcnt(0) 710; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc 711; SI-NEXT: s_waitcnt vmcnt(0) 712; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc 713; SI-NEXT: 
s_waitcnt vmcnt(0) 714; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc 715; SI-NEXT: s_waitcnt vmcnt(0) 716; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc 717; SI-NEXT: s_waitcnt vmcnt(0) 718; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc 719; SI-NEXT: s_waitcnt vmcnt(0) 720; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc 721; SI-NEXT: s_waitcnt vmcnt(0) 722; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc 723; SI-NEXT: s_waitcnt vmcnt(0) 724; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc 725; SI-NEXT: s_waitcnt vmcnt(0) 726; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc 727; SI-NEXT: s_waitcnt vmcnt(0) 728; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc 729; SI-NEXT: s_waitcnt vmcnt(0) 730; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 731; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 732; SI-NEXT: v_or_b32_e32 v5, v6, v2 733; SI-NEXT: v_or_b32_e32 v4, v4, v3 734; SI-NEXT: s_mov_b64 vcc, exec 735; SI-NEXT: s_cbranch_execz .LBB4_3 736; SI-NEXT: s_branch .LBB4_4 737; SI-NEXT: .LBB4_2: 738; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 739; SI-NEXT: s_mov_b64 vcc, 0 740; SI-NEXT: .LBB4_3: ; %T 741; SI-NEXT: s_mov_b32 s6, 0 742; SI-NEXT: s_mov_b32 s7, 0xf000 743; SI-NEXT: s_mov_b32 s4, s6 744; SI-NEXT: s_mov_b32 s5, s6 745; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 746; SI-NEXT: s_waitcnt vmcnt(0) 747; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc 748; SI-NEXT: s_waitcnt vmcnt(0) 749; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc 750; SI-NEXT: s_waitcnt vmcnt(0) 751; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc 752; SI-NEXT: s_waitcnt vmcnt(0) 753; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc 754; SI-NEXT: s_waitcnt vmcnt(0) 755; SI-NEXT: 
buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc 756; SI-NEXT: s_waitcnt vmcnt(0) 757; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc 758; SI-NEXT: s_waitcnt vmcnt(0) 759; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc 760; SI-NEXT: s_waitcnt vmcnt(0) 761; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc 762; SI-NEXT: s_waitcnt vmcnt(0) 763; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc 764; SI-NEXT: s_waitcnt vmcnt(0) 765; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc 766; SI-NEXT: s_waitcnt vmcnt(0) 767; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc 768; SI-NEXT: s_waitcnt vmcnt(0) 769; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc 770; SI-NEXT: s_waitcnt vmcnt(0) 771; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc 772; SI-NEXT: s_waitcnt vmcnt(0) 773; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc 774; SI-NEXT: s_waitcnt vmcnt(0) 775; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc 776; SI-NEXT: s_waitcnt vmcnt(0) 777; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 778; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 779; SI-NEXT: v_or_b32_e32 v5, v4, v0 780; SI-NEXT: v_or_b32_e32 v4, v2, v1 781; SI-NEXT: .LBB4_4: ; %exit 782; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4 783; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48 784; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 785; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 786; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 787; SI-NEXT: v_bfrev_b32_e32 v5, 1 788; SI-NEXT: v_mov_b32_e32 v6, 0xffff 789; SI-NEXT: v_mov_b32_e32 v7, 0x8000 790; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 791; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc 792; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 793; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc 794; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 795; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc 796; SI-NEXT: 
v_cmp_lt_i32_e32 vcc, -1, v0 797; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 798; SI-NEXT: v_or_b32_e32 v0, v1, v8 799; SI-NEXT: v_or_b32_e32 v2, v2, v3 800; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16 801; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 802; SI-NEXT: s_setpc_b64 s[30:31] 803; 804; GFX9-LABEL: vec_16xi16_extract_4xi16_2: 805; GFX9: ; %bb.0: 806; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 807; GFX9-NEXT: s_cbranch_scc0 .LBB4_2 808; GFX9-NEXT: ; %bb.1: ; %F 809; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc 810; GFX9-NEXT: s_waitcnt vmcnt(0) 811; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc 812; GFX9-NEXT: s_waitcnt vmcnt(0) 813; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 814; GFX9-NEXT: s_cbranch_execz .LBB4_3 815; GFX9-NEXT: s_branch .LBB4_4 816; GFX9-NEXT: .LBB4_2: 817; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 818; GFX9-NEXT: .LBB4_3: ; %T 819; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc 820; GFX9-NEXT: s_waitcnt vmcnt(0) 821; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc 822; GFX9-NEXT: s_waitcnt vmcnt(0) 823; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 824; GFX9-NEXT: .LBB4_4: ; %exit 825; GFX9-NEXT: s_waitcnt vmcnt(0) 826; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] 827; GFX9-NEXT: s_movk_i32 s4, 0x8000 828; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 829; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 830; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] 831; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 832; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 833; GFX9-NEXT: s_mov_b32 s4, 0x5040100 834; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 835; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 836; GFX9-NEXT: s_setpc_b64 s[30:31] 837; 838; GFX11-LABEL: vec_16xi16_extract_4xi16_2: 839; GFX11: ; %bb.0: 840; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 841; GFX11-NEXT: s_cbranch_scc0 .LBB4_2 842; GFX11-NEXT: ; %bb.1: ; %F 843; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc 844; GFX11-NEXT: s_waitcnt vmcnt(0) 845; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc 846; GFX11-NEXT: s_waitcnt vmcnt(0) 847; GFX11-NEXT: s_cbranch_execz .LBB4_3 848; GFX11-NEXT: s_branch .LBB4_4 849; GFX11-NEXT: .LBB4_2: 850; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 851; GFX11-NEXT: .LBB4_3: ; %T 852; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc 853; GFX11-NEXT: s_waitcnt vmcnt(0) 854; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc 855; GFX11-NEXT: s_waitcnt vmcnt(0) 856; GFX11-NEXT: .LBB4_4: ; %exit 857; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] 858; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1] 859; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 860; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 861; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 862; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1 863; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 864; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 865; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2 866; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3 867; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 868; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 869; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 870; GFX11-NEXT: s_setpc_b64 s[30:31] 871 br i1 undef, label %T, label %F 872 873T: 874 %t = load volatile <16 x i16>, ptr addrspace(1) %p0 875 br label %exit 876 877F: 878 %f = load volatile <16 x i16>, ptr addrspace(1) %p1 879 br label %exit 880 881exit: 882 %m = phi <16 x i16> [ %t, %T ], [ %f, %F ] 883 %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 884 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1> 885 
%r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1> 886 ret <4 x i16> %r2 887} 888 889define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) { 890; 891; SI-LABEL: vec_16xf16_extract_4xf16: 892; SI: ; %bb.0: 893; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 894; SI-NEXT: s_cbranch_scc0 .LBB5_2 895; SI-NEXT: ; %bb.1: ; %F 896; SI-NEXT: s_mov_b32 s6, 0 897; SI-NEXT: s_mov_b32 s7, 0xf000 898; SI-NEXT: s_mov_b32 s4, s6 899; SI-NEXT: s_mov_b32 s5, s6 900; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc 901; SI-NEXT: s_waitcnt vmcnt(0) 902; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc 903; SI-NEXT: s_waitcnt vmcnt(0) 904; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc 905; SI-NEXT: s_waitcnt vmcnt(0) 906; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc 907; SI-NEXT: s_waitcnt vmcnt(0) 908; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc 909; SI-NEXT: s_waitcnt vmcnt(0) 910; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc 911; SI-NEXT: s_waitcnt vmcnt(0) 912; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc 913; SI-NEXT: s_waitcnt vmcnt(0) 914; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc 915; SI-NEXT: s_waitcnt vmcnt(0) 916; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc 917; SI-NEXT: s_waitcnt vmcnt(0) 918; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc 919; SI-NEXT: s_waitcnt vmcnt(0) 920; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc 921; SI-NEXT: s_waitcnt vmcnt(0) 922; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc 923; SI-NEXT: s_waitcnt vmcnt(0) 924; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc 925; SI-NEXT: s_waitcnt vmcnt(0) 926; SI-NEXT: buffer_load_ushort 
v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_or_b32_e32 v4, v4, v7
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB5_3
; SI-NEXT: s_branch .LBB5_4
; SI-NEXT: .LBB5_2:
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: ; implicit-def: $vgpr3
; SI-NEXT: ; implicit-def: $vgpr2
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB5_3: ; %T
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: .LBB5_4: ; %exit
; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2
; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; SI-NEXT: v_mov_b32_e32 v3, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xf16_extract_4xf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cbranch_scc0 .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %F
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT: s_cbranch_execz .LBB5_3
; GFX9-NEXT: s_branch .LBB5_4
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB5_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB5_4: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4
; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: vec_16xf16_extract_4xf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cbranch_scc0 .LBB5_2
; GFX11-NEXT: ; %bb.1: ; %F
; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_cbranch_execz .LBB5_3
; GFX11-NEXT: s_branch .LBB5_4
; GFX11-NEXT: .LBB5_2:
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-NEXT: .LBB5_3: ; %T
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB5_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1
; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
  br i1 undef, label %T, label %F

T:
  %t = load volatile <16 x half>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <16 x half>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
  ; Keep lanes 0, 1, 2 and repeat lane 2 in the last position.
  %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Lanes that compare ugt 0xH3800 select 0xH3900, the others select 0xH3D00.
  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
  ret <4 x half> %r2
}

; Assemble an <8 x i16> from eight scalar i16 loads out of addrspace(3).
; The element index is (%idxp << 4) | k for k = 0..7, applied through a
; getelementptr on half; the loads are paired with insertelement and the four
; pairs are concatenated with shufflevector.
define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
; SI-LABEL: large_vector:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:
v_lshlrev_b32_e32 v1, 5, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: ds_read_b32 v2, v1
; SI-NEXT: ds_read_b32 v4, v3
; SI-NEXT: ds_read_b32 v6, v5
; SI-NEXT: s_waitcnt lgkmcnt(3)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: s_waitcnt lgkmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: large_vector:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v2, v1, 5, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: large_vector:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshl_add_u32 v2, v1, 5, v0
; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %idx = shl i32 %idxp, 4

  ; Pair 0: elements idx|0 (align 4) and idx|1 (align 2).
  %i.0 = or disjoint i32 %idx, 0
  %p.0 = getelementptr half, ptr addrspace(3) %p, i32 %i.0
  %x.0 = load i16, ptr addrspace(3) %p.0, align 4
  %v0p = insertelement <8 x i16> poison, i16 %x.0, i32 0
  %i.1 = or disjoint i32 %idx, 1
  %p.1 = getelementptr half, ptr addrspace(3) %p, i32 %i.1
  %x.1 = load i16, ptr addrspace(3) %p.1, align 2
  %v0 = insertelement <8 x i16> %v0p, i16 %x.1, i32 1

  ; Pair 1: elements idx|2 and idx|3.
  %i.2 = or disjoint i32 %idx, 2
  %p.2 = getelementptr half, ptr addrspace(3) %p, i32 %i.2
  %x.2 = load i16, ptr addrspace(3) %p.2, align 4
  %v1p = insertelement <8 x i16> poison, i16 %x.2, i32 0
  %i.3 = or disjoint i32 %idx, 3
  %p.3 = getelementptr half, ptr addrspace(3) %p, i32 %i.3
  %x.3 = load i16, ptr addrspace(3) %p.3, align 2
  %v1 = insertelement <8 x i16> %v1p, i16 %x.3, i32 1

  ; Pair 2: elements idx|4 and idx|5.
  %i.4 = or disjoint i32 %idx, 4
  %p.4 = getelementptr half, ptr addrspace(3) %p, i32 %i.4
  %x.4 = load i16, ptr addrspace(3) %p.4, align 4
  %v2p = insertelement <8 x i16> poison, i16 %x.4, i32 0
  %i.5 = or disjoint i32 %idx, 5
  %p.5 = getelementptr half, ptr addrspace(3) %p, i32 %i.5
  %x.5 = load i16, ptr addrspace(3) %p.5, align 2
  %v2 = insertelement <8 x i16> %v2p, i16 %x.5, i32 1

  ; Pair 3: elements idx|6 and idx|7.
  %i.6 = or disjoint i32 %idx, 6
  %p.6 = getelementptr half, ptr addrspace(3) %p, i32 %i.6
  %x.6 = load i16, ptr addrspace(3) %p.6, align 4
  %v3p = insertelement <8 x i16> poison, i16 %x.6, i32 0
  %i.7 = or disjoint i32 %idx, 7
  %p.7 = getelementptr half, ptr addrspace(3) %p, i32 %i.7
  %x.7 = load i16, ptr addrspace(3) %p.7, align 2
  %v3 = insertelement <8 x i16> %v3p, i16 %x.7, i32 1

  ; Concatenate the four pairs into result lanes 0..7.
  %z.1 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
  %z.2 = shufflevector <8 x i16> %z.1, <8 x i16> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
  %z.3 = shufflevector <8 x i16> %z.2, <8 x i16> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i16> %z.3
}

; Extract the low eight lanes of a <16 x i16> value in an amdgpu_gfx function.
; The entry branch is on the `i1 inreg %cond` argument; both successors do a
; volatile <16 x i16> load (from %p0 in %T, from %p1 in %F) and the narrowed
; phi feeds an icmp ugt / select pair.
define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; SI-LABEL: vec_16xi16_extract_8xi16_0:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 1, v4
; SI-NEXT:
v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: s_and_b64 s[34:35], vcc, exec
; SI-NEXT: s_mov_b32 s38, 0
; SI-NEXT: s_cbranch_scc0 .LBB7_2
; SI-NEXT: ; %bb.1: ; %F
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:18 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:20 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:22 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:24 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:26 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:28 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4
; SI-NEXT: v_or_b32_e32 v5, v10, v2
; SI-NEXT: v_or_b32_e32 v4, v8, v3
; SI-NEXT: v_or_b32_e32 v3, v7, v9
; SI-NEXT: v_or_b32_e32 v2, v6, v11
; SI-NEXT: s_mov_b64 vcc, exec
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_2:
; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
; SI-NEXT: .LBB7_3: ; %T
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
; SI-NEXT: v_or_b32_e32 v5, v8, v0
; SI-NEXT: v_or_b32_e32 v4, v7, v1
; SI-NEXT: v_or_b32_e32 v3, v6, v9
; SI-NEXT: v_or_b32_e32 v2, v2, v10
; SI-NEXT: .LBB7_4: ; %exit
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
; SI-NEXT: s_movk_i32 s34, 0x3800
; SI-NEXT: v_mov_b32_e32 v8, 0x3d000000
; SI-NEXT: v_mov_b32_e32 v9, 0x39000000
; SI-NEXT: v_mov_b32_e32 v10, 0x3d00
; SI-NEXT: v_mov_b32_e32 v11, 0x3900
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0
; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc
; SI-NEXT: v_or_b32_e32 v0, v0, v12
; SI-NEXT: v_or_b32_e32 v4, v1, v13
; SI-NEXT: v_or_b32_e32 v6, v2, v14
; SI-NEXT: v_or_b32_e32 v2, v3, v5
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16
; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GFX9-NEXT: s_and_b64 s[34:35], vcc, exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_2
; GFX9-NEXT: ; %bb.1: ; %F
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT: s_cbranch_execz .LBB7_3
; GFX9-NEXT: s_branch .LBB7_4
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB7_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB7_4: ; %exit
; GFX9-NEXT: s_movk_i32 s35, 0x3801
; GFX9-NEXT: s_movk_i32 s34, 0x3800
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v7
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT: v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_cmp_lt_u16_sdwa vcc, v6, s35 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v5
; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc
; GFX9-NEXT: v_cmp_lt_u16_sdwa vcc, v5, s35 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v4
; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc
; GFX9-NEXT: v_cmp_lt_u16_sdwa vcc, v4, s35 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v9, s34
; GFX9-NEXT: v_perm_b32 v1, v5, v8, s34
; GFX9-NEXT: v_perm_b32 v2, v6, v2, s34
; GFX9-NEXT: v_perm_b32 v3, v7, v3, s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: vec_16xi16_extract_8xi16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_u8 v4, off, s32
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
; GFX11-NEXT: s_cbranch_scc0 .LBB7_2
; GFX11-NEXT: ; %bb.1: ; %F
; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccz .LBB7_3
; GFX11-NEXT: s_branch .LBB7_4
; GFX11-NEXT: .LBB7_2:
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-NEXT: .LBB7_3: ; %T
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB7_4: ; %exit
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4
; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7
; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3
; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2
; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
; GFX11-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8
; GFX11-NEXT: v_perm_b32 v2, v7, v4, 0x5040100
; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
  br i1 %cond, label %T, label %F

T:
  %t = load volatile <16 x i16>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <16 x i16>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep lanes 0..7 only.
  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Lanes that compare ugt 0x3800 select 0x3900, the others select 0x3D00.
  %b2 = icmp ugt <8 x i16> %v2, <i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800>
  %r2 = select <8 x i1> %b2, <8 x i16> <i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900>, <8 x i16> <i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00>
  ret <8 x i16> %r2
}

define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; SI-LABEL: vec_16xf16_extract_8xf16_0:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 1, v4
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: s_and_b64 s[34:35], vcc, exec
; SI-NEXT: s_mov_b32 s38, 0
; SI-NEXT: s_cbranch_scc0 .LBB8_2
; SI-NEXT: ; %bb.1: ; %F
; SI-NEXT: s_mov_b32 s39, 0xf000
; SI-NEXT: s_mov_b32 s36, s38
; SI-NEXT: s_mov_b32 s37, s38
; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
; SI-NEXT: s_waitcnt vmcnt(0)
1479; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc 1480; SI-NEXT: s_waitcnt vmcnt(0) 1481; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc 1482; SI-NEXT: s_waitcnt vmcnt(0) 1483; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc 1484; SI-NEXT: s_waitcnt vmcnt(0) 1485; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc 1486; SI-NEXT: s_waitcnt vmcnt(0) 1487; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc 1488; SI-NEXT: s_waitcnt vmcnt(0) 1489; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:18 glc 1490; SI-NEXT: s_waitcnt vmcnt(0) 1491; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:20 glc 1492; SI-NEXT: s_waitcnt vmcnt(0) 1493; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:22 glc 1494; SI-NEXT: s_waitcnt vmcnt(0) 1495; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:24 glc 1496; SI-NEXT: s_waitcnt vmcnt(0) 1497; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:26 glc 1498; SI-NEXT: s_waitcnt vmcnt(0) 1499; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:28 glc 1500; SI-NEXT: s_waitcnt vmcnt(0) 1501; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc 1502; SI-NEXT: s_waitcnt vmcnt(0) 1503; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 1504; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 1505; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 1506; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6 1507; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 1508; SI-NEXT: v_cvt_f32_f16_e32 v3, v9 1509; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1510; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1511; SI-NEXT: v_or_b32_e32 v9, v10, v12 1512; SI-NEXT: v_or_b32_e32 v8, v8, v13 1513; SI-NEXT: v_or_b32_e32 v10, v7, v14 1514; SI-NEXT: v_or_b32_e32 v11, v5, v15 1515; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 1516; SI-NEXT: v_cvt_f32_f16_e32 v7, v8 1517; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 1518; 
SI-NEXT: v_cvt_f32_f16_e32 v9, v11 1519; SI-NEXT: s_mov_b64 vcc, exec 1520; SI-NEXT: s_cbranch_execz .LBB8_3 1521; SI-NEXT: s_branch .LBB8_4 1522; SI-NEXT: .LBB8_2: 1523; SI-NEXT: ; implicit-def: $vgpr9 1524; SI-NEXT: ; implicit-def: $vgpr6 1525; SI-NEXT: ; implicit-def: $vgpr8 1526; SI-NEXT: ; implicit-def: $vgpr4 1527; SI-NEXT: ; implicit-def: $vgpr7 1528; SI-NEXT: ; implicit-def: $vgpr3 1529; SI-NEXT: ; implicit-def: $vgpr5 1530; SI-NEXT: ; implicit-def: $vgpr2 1531; SI-NEXT: s_mov_b64 vcc, 0 1532; SI-NEXT: .LBB8_3: ; %T 1533; SI-NEXT: s_mov_b32 s39, 0xf000 1534; SI-NEXT: s_mov_b32 s36, s38 1535; SI-NEXT: s_mov_b32 s37, s38 1536; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc 1537; SI-NEXT: s_waitcnt vmcnt(0) 1538; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc 1539; SI-NEXT: s_waitcnt vmcnt(0) 1540; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc 1541; SI-NEXT: s_waitcnt vmcnt(0) 1542; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc 1543; SI-NEXT: s_waitcnt vmcnt(0) 1544; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc 1545; SI-NEXT: s_waitcnt vmcnt(0) 1546; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:10 glc 1547; SI-NEXT: s_waitcnt vmcnt(0) 1548; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc 1549; SI-NEXT: s_waitcnt vmcnt(0) 1550; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:14 glc 1551; SI-NEXT: s_waitcnt vmcnt(0) 1552; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc 1553; SI-NEXT: s_waitcnt vmcnt(0) 1554; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc 1555; SI-NEXT: s_waitcnt vmcnt(0) 1556; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc 1557; SI-NEXT: s_waitcnt vmcnt(0) 1558; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc 1559; SI-NEXT: s_waitcnt vmcnt(0) 1560; SI-NEXT: 
buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc 1561; SI-NEXT: s_waitcnt vmcnt(0) 1562; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc 1563; SI-NEXT: s_waitcnt vmcnt(0) 1564; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc 1565; SI-NEXT: s_waitcnt vmcnt(0) 1566; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc 1567; SI-NEXT: s_waitcnt vmcnt(0) 1568; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 1569; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 1570; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 1571; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 1572; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1573; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1574; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1575; SI-NEXT: v_or_b32_e32 v0, v9, v0 1576; SI-NEXT: v_or_b32_e32 v1, v8, v1 1577; SI-NEXT: v_or_b32_e32 v8, v7, v10 1578; SI-NEXT: v_or_b32_e32 v9, v5, v11 1579; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 1580; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 1581; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 1582; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 1583; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1584; SI-NEXT: .LBB8_4: ; %exit 1585; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 1586; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 1587; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 1588; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1589; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1590; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1591; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1592; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1593; SI-NEXT: v_mov_b32_e32 v8, 0x3fa00000 1594; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000 1595; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1596; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1597; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1598; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1599; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1600; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 1601; SI-NEXT: v_cvt_f32_f16_e32 v11, v5 1602; SI-NEXT: v_cvt_f32_f16_e32 v12, v2 1603; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 1604; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc 1605; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 
1606; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 1607; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6 1608; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 1609; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4 1610; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc 1611; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7 1612; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc 1613; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10 1614; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 1615; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v11 1616; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc 1617; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v12 1618; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc 1619; SI-NEXT: s_setpc_b64 s[30:31] 1620; 1621; GFX9-LABEL: vec_16xf16_extract_8xf16_0: 1622; GFX9: ; %bb.0: 1623; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1624; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 1625; GFX9-NEXT: s_waitcnt vmcnt(0) 1626; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 1627; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 1628; GFX9-NEXT: s_and_b64 s[34:35], vcc, exec 1629; GFX9-NEXT: s_cbranch_scc0 .LBB8_2 1630; GFX9-NEXT: ; %bb.1: ; %F 1631; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc 1632; GFX9-NEXT: s_waitcnt vmcnt(0) 1633; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc 1634; GFX9-NEXT: s_waitcnt vmcnt(0) 1635; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 1636; GFX9-NEXT: s_cbranch_execz .LBB8_3 1637; GFX9-NEXT: s_branch .LBB8_4 1638; GFX9-NEXT: .LBB8_2: 1639; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 1640; GFX9-NEXT: .LBB8_3: ; %T 1641; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc 1642; GFX9-NEXT: s_waitcnt vmcnt(0) 1643; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc 1644; GFX9-NEXT: s_waitcnt vmcnt(0) 1645; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 1646; GFX9-NEXT: .LBB8_4: ; %exit 1647; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800 1648; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900 1649; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00 1650; GFX9-NEXT: 
s_waitcnt vmcnt(0) 1651; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7 1652; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc 1653; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD 1654; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc 1655; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v6 1656; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v2, vcc 1657; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v6, v0 src0_sel:WORD_1 src1_sel:DWORD 1658; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v2, vcc 1659; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5 1660; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v2, vcc 1661; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v5, v0 src0_sel:WORD_1 src1_sel:DWORD 1662; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc 1663; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 1664; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v2, vcc 1665; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD 1666; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 1667; GFX9-NEXT: v_pack_b32_f16 v0, v10, v0 1668; GFX9-NEXT: v_pack_b32_f16 v1, v9, v5 1669; GFX9-NEXT: v_pack_b32_f16 v2, v8, v6 1670; GFX9-NEXT: v_pack_b32_f16 v3, v3, v7 1671; GFX9-NEXT: s_setpc_b64 s[30:31] 1672; 1673; GFX11-LABEL: vec_16xf16_extract_8xf16_0: 1674; GFX11: ; %bb.0: 1675; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1676; GFX11-NEXT: scratch_load_u8 v4, off, s32 1677; GFX11-NEXT: s_mov_b32 s0, 0 1678; GFX11-NEXT: s_waitcnt vmcnt(0) 1679; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 1680; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1681; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 1682; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo 1683; GFX11-NEXT: s_cbranch_scc0 .LBB8_2 1684; GFX11-NEXT: ; %bb.1: ; %F 1685; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc 1686; GFX11-NEXT: s_waitcnt vmcnt(0) 1687; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc 1688; GFX11-NEXT: s_waitcnt vmcnt(0) 1689; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 1690; GFX11-NEXT: s_cbranch_vccz .LBB8_3 1691; GFX11-NEXT: s_branch .LBB8_4 1692; 
GFX11-NEXT: .LBB8_2: 1693; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 1694; GFX11-NEXT: .LBB8_3: ; %T 1695; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc 1696; GFX11-NEXT: s_waitcnt vmcnt(0) 1697; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc 1698; GFX11-NEXT: s_waitcnt vmcnt(0) 1699; GFX11-NEXT: .LBB8_4: ; %exit 1700; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5 1701; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900 1702; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00 1703; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 1704; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 1705; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 1706; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 1707; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo 1708; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v4 1709; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo 1710; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v7 1711; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo 1712; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3 1713; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo 1714; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2 1715; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo 1716; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v0 1717; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo 1718; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v6 1719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) 1720; GFX11-NEXT: v_pack_b32_f16 v0, v2, v0 1721; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo 1722; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v8 1723; GFX11-NEXT: v_pack_b32_f16 v2, v4, v7 1724; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 1725; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo 1726; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1727; GFX11-NEXT: v_pack_b32_f16 v3, v5, v6 1728; GFX11-NEXT: s_setpc_b64 s[30:31] 1729 br i1 %cond, label %T, label %F 1730 1731T: 1732 %t = load volatile <16 x half>, ptr addrspace(1) %p0 
1733 br label %exit 1734 1735F: 1736 %f = load volatile <16 x half>, ptr addrspace(1) %p1 1737 br label %exit 1738 1739exit: 1740 %m = phi <16 x half> [ %t, %T ], [ %f, %F ] 1741 %v2 = shufflevector <16 x half> %m, <16 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1742 %b2 = fcmp ugt <8 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800> 1743 %r2 = select <8 x i1> %b2, <8 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <8 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00> 1744 ret <8 x half> %r2 1745} 1746