; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11

; Scalar half select: stores (a olt b) ? c : d to %r. All four inputs are
; volatile loads. The tahiti checks widen to f32 for the compare and select;
; the fiji and gfx1100 checks compare in f16 directly.
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s10
; SI-NEXT:    s_mov_b32 s17, s11
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s12
; SI-NEXT:    s_mov_b32 s21, s13
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s12, s14
; SI-NEXT:    s_mov_b32 s13, s15
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x44
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s10
; VI-NEXT:    s_mov_b32 s17, s11
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s12
; VI-NEXT:    s_mov_b32 s21, s13
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s12, s14
; VI-NEXT:    s_mov_b32 s13, s15
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v3, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s8
; VI-NEXT:    s_mov_b32 s1, s9
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x44
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s18, s6
; GFX11-NEXT:    s_mov_b32 s19, s7
; GFX11-NEXT:    s_mov_b32 s22, s6
; GFX11-NEXT:    s_mov_b32 s23, s7
; GFX11-NEXT:    s_mov_b32 s26, s6
; GFX11-NEXT:    s_mov_b32 s27, s7
; GFX11-NEXT:    s_mov_b32 s2, s6
; GFX11-NEXT:    s_mov_b32 s3, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s16, s10
; GFX11-NEXT:    s_mov_b32 s17, s11
; GFX11-NEXT:    s_mov_b32 s20, s12
; GFX11-NEXT:    s_mov_b32 s21, s13
; GFX11-NEXT:    s_mov_b32 s24, s14
; GFX11-NEXT:    s_mov_b32 s25, s15
; GFX11-NEXT:    buffer_load_u16 v0, off, s[16:19], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v2, off, s[24:27], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v3, off, s[0:3], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s4, s8
; GFX11-NEXT:    s_mov_b32 s5, s9
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm

    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c,
    ptr addrspace(1) %d) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %c.val = load volatile half, ptr addrspace(1) %c
  %d.val = load volatile half, ptr addrspace(1) %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Same select, but the left compare operand is the constant 0xH3800, which the
; checks print as the inline immediate 0.5: stores (0.5 olt b) ? c : d.
define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s18, s10
; GFX11-NEXT:    s_mov_b32 s19, s11
; GFX11-NEXT:    s_mov_b32 s22, s10
; GFX11-NEXT:    s_mov_b32 s23, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    s_mov_b32 s16, s4
; GFX11-NEXT:    s_mov_b32 s17, s5
; GFX11-NEXT:    s_mov_b32 s20, s6
; GFX11-NEXT:    s_mov_b32 s21, s7
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c,
    ptr addrspace(1) %d) {
entry:
  %b.val = load volatile half, ptr addrspace(1) %b
  %c.val = load volatile half, ptr addrspace(1) %c
  %d.val = load volatile half, ptr addrspace(1) %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; The right compare operand is the constant 0xH3800: stores (a olt 0.5) ? c : d.
; The checks expect the compare commuted to "gt" with the immediate first.
define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_f16_imm_b:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s18, s10
; GFX11-NEXT:    s_mov_b32 s19, s11
; GFX11-NEXT:    s_mov_b32 s22, s10
; GFX11-NEXT:    s_mov_b32 s23, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    s_mov_b32 s16, s4
; GFX11-NEXT:    s_mov_b32 s17, s5
; GFX11-NEXT:    s_mov_b32 s20, s6
; GFX11-NEXT:    s_mov_b32 s21, s7
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %c,
    ptr addrspace(1) %d) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %c.val = load volatile half, ptr addrspace(1) %c
  %d.val = load volatile half, ptr addrspace(1) %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; The selected-when-true value is the constant 0xH3800: stores
; (a olt b) ? 0.5 : d. The checks expect an inverted compare ("nlt") with the
; constant as the first v_cndmask source; fiji first moves 0x3800 into a VGPR.
define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_f16_imm_c:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s18, s10
; GFX11-NEXT:    s_mov_b32 s19, s11
; GFX11-NEXT:    s_mov_b32 s22, s10
; GFX11-NEXT:    s_mov_b32 s23, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    s_mov_b32 s16, s4
; GFX11-NEXT:    s_mov_b32 s17, s5
; GFX11-NEXT:    s_mov_b32 s20, s6
; GFX11-NEXT:    s_mov_b32 s21, s7
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %d) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %d.val = load volatile half, ptr addrspace(1) %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; The selected-when-false value is the constant 0xH3800: stores
; (a olt b) ? c : 0.5. The compare keeps its "lt" form and the constant is the
; first v_cndmask source; fiji first moves 0x3800 into a VGPR.
define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_f16_imm_d:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s18, s10
; GFX11-NEXT:    s_mov_b32 s19, s11
; GFX11-NEXT:    s_mov_b32 s22, s10
; GFX11-NEXT:    s_mov_b32 s23, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    s_mov_b32 s16, s4
; GFX11-NEXT:    s_mov_b32 s17, s5
; GFX11-NEXT:    s_mov_b32 s20, s6
; GFX11-NEXT:    s_mov_b32 s21, s7
; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %c.val = load volatile half, ptr addrspace(1) %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> version: element-wise (a olt b) ? c : d. The loads here are not
; volatile. The checks split each dword into low/high halves, select each half
; separately and repack the two results into one dword store.
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s10
; SI-NEXT:    s_mov_b32 s17, s11
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s12
; SI-NEXT:    s_mov_b32 s21, s13
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s12, s14
; SI-NEXT:    s_mov_b32 s13, s15
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v2, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s10
; VI-NEXT:    s_mov_b32 s17, s11
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s12
; VI-NEXT:    s_mov_b32 s21, s13
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s12, s14
; VI-NEXT:    s_mov_b32 s13, s15
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_dword v2, off, s[16:19], 0
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v3, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s0, s8
; VI-NEXT:    s_mov_b32 s1, s9
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v2, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_v2f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x44
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, s2
; GFX11-NEXT:    s_mov_b32 s7, s3
; GFX11-NEXT:    s_mov_b32 s22, s2
; GFX11-NEXT:    s_mov_b32 s23, s3
; GFX11-NEXT:    s_mov_b32 s18, s2
; GFX11-NEXT:    s_mov_b32 s19, s3
; GFX11-NEXT:    s_mov_b32 s26, s2
; GFX11-NEXT:    s_mov_b32 s27, s3
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s20, s12
; GFX11-NEXT:    s_mov_b32 s21, s13
; GFX11-NEXT:    s_mov_b32 s16, s10
; GFX11-NEXT:    s_mov_b32 s17, s11
; GFX11-NEXT:    s_mov_b32 s24, s14
; GFX11-NEXT:    s_mov_b32 s25, s15
; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
; GFX11-NEXT:    buffer_load_b32 v3, off, s[24:27], 0
; GFX11-NEXT:    s_mov_b32 s0, s8
; GFX11-NEXT:    s_mov_b32 s1, s9
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v2, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v6, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c,
    ptr addrspace(1) %d) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %c.val = load <2 x half>, ptr addrspace(1) %c
  %d.val = load <2 x half>, ptr addrspace(1) %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; <2 x half> select whose left compare operand is the constant vector
; <0xH3800, 0xH3900>. Element 0 uses the inline immediate 0.5; element 1
; (0xH3900) needs a literal: 0x3f200000 as f32 on tahiti, 0x3900 as f16 on
; fiji and gfx1100.
define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_a:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s18, s10
; GFX11-NEXT:    s_mov_b32 s19, s11
; GFX11-NEXT:    s_mov_b32 s22, s10
; GFX11-NEXT:    s_mov_b32 s23, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s2
; GFX11-NEXT:    s_mov_b32 s13, s3
; GFX11-NEXT:    s_mov_b32 s16, s4
; GFX11-NEXT:    s_mov_b32 s17, s5
; GFX11-NEXT:    s_mov_b32 s20, s6
; GFX11-NEXT:    s_mov_b32 s21, s7
; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
; GFX11-NEXT:    s_mov_b32 s8, s0
; GFX11-NEXT:    s_mov_b32 s9, s1
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c,
    ptr addrspace(1) %d) {
entry:
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %c.val = load <2 x half>, ptr addrspace(1) %c
  %d.val = load <2 x half>, ptr addrspace(1) %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32
v1, 16, v1 959; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 960; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 961; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 962; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 963; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 964; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 965; VI-NEXT: s_endpgm 966; 967; GFX11-LABEL: select_v2f16_imm_b: 968; GFX11: ; %bb.0: ; %entry 969; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 970; GFX11-NEXT: s_mov_b32 s10, -1 971; GFX11-NEXT: s_mov_b32 s11, 0x31016000 972; GFX11-NEXT: s_mov_b32 s14, s10 973; GFX11-NEXT: s_mov_b32 s15, s11 974; GFX11-NEXT: s_mov_b32 s18, s10 975; GFX11-NEXT: s_mov_b32 s19, s11 976; GFX11-NEXT: s_mov_b32 s22, s10 977; GFX11-NEXT: s_mov_b32 s23, s11 978; GFX11-NEXT: s_waitcnt lgkmcnt(0) 979; GFX11-NEXT: s_mov_b32 s12, s2 980; GFX11-NEXT: s_mov_b32 s13, s3 981; GFX11-NEXT: s_mov_b32 s16, s4 982; GFX11-NEXT: s_mov_b32 s17, s5 983; GFX11-NEXT: s_mov_b32 s20, s6 984; GFX11-NEXT: s_mov_b32 s21, s7 985; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 986; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 987; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 988; GFX11-NEXT: s_mov_b32 s8, s0 989; GFX11-NEXT: s_mov_b32 s9, s1 990; GFX11-NEXT: s_waitcnt vmcnt(2) 991; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 992; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 993; GFX11-NEXT: s_waitcnt vmcnt(1) 994; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 995; GFX11-NEXT: s_waitcnt vmcnt(0) 996; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 997; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo 998; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3 999; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1000; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo 1001; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1002; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1003; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1004; GFX11-NEXT: buffer_store_b32 v0, off, 
s[8:11], 0 1005; GFX11-NEXT: s_endpgm 1006 ptr addrspace(1) %r, 1007 ptr addrspace(1) %a, 1008 ptr addrspace(1) %c, 1009 ptr addrspace(1) %d) { 1010entry: 1011 %a.val = load <2 x half>, ptr addrspace(1) %a 1012 %c.val = load <2 x half>, ptr addrspace(1) %c 1013 %d.val = load <2 x half>, ptr addrspace(1) %d 1014 %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900> 1015 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val 1016 store <2 x half> %r.val, ptr addrspace(1) %r 1017 ret void 1018} 1019 1020define amdgpu_kernel void @select_v2f16_imm_c( 1021; SI-LABEL: select_v2f16_imm_c: 1022; SI: ; %bb.0: ; %entry 1023; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 1024; SI-NEXT: s_mov_b32 s11, 0xf000 1025; SI-NEXT: s_mov_b32 s10, -1 1026; SI-NEXT: s_mov_b32 s14, s10 1027; SI-NEXT: s_mov_b32 s15, s11 1028; SI-NEXT: s_waitcnt lgkmcnt(0) 1029; SI-NEXT: s_mov_b32 s12, s2 1030; SI-NEXT: s_mov_b32 s13, s3 1031; SI-NEXT: s_mov_b32 s16, s4 1032; SI-NEXT: s_mov_b32 s17, s5 1033; SI-NEXT: s_mov_b32 s18, s10 1034; SI-NEXT: s_mov_b32 s19, s11 1035; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 1036; SI-NEXT: s_mov_b32 s4, s6 1037; SI-NEXT: s_mov_b32 s5, s7 1038; SI-NEXT: s_mov_b32 s6, s10 1039; SI-NEXT: s_mov_b32 s7, s11 1040; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 1041; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 1042; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 1043; SI-NEXT: s_mov_b32 s8, s0 1044; SI-NEXT: s_mov_b32 s9, s1 1045; SI-NEXT: s_waitcnt vmcnt(2) 1046; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 1047; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1048; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1049; SI-NEXT: s_waitcnt vmcnt(1) 1050; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 1051; SI-NEXT: s_waitcnt vmcnt(0) 1052; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1053; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1054; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1055; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1056; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1057; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5 1058; SI-NEXT: 
v_cndmask_b32_e32 v0, v3, v6, vcc 1059; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1 1060; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1061; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc 1062; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1063; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1064; SI-NEXT: v_or_b32_e32 v0, v1, v0 1065; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1066; SI-NEXT: s_endpgm 1067; 1068; VI-LABEL: select_v2f16_imm_c: 1069; VI: ; %bb.0: ; %entry 1070; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1071; VI-NEXT: s_mov_b32 s11, 0xf000 1072; VI-NEXT: s_mov_b32 s10, -1 1073; VI-NEXT: s_mov_b32 s18, s10 1074; VI-NEXT: s_mov_b32 s19, s11 1075; VI-NEXT: s_waitcnt lgkmcnt(0) 1076; VI-NEXT: s_mov_b32 s16, s4 1077; VI-NEXT: s_mov_b32 s17, s5 1078; VI-NEXT: s_mov_b32 s14, s10 1079; VI-NEXT: s_mov_b32 s12, s2 1080; VI-NEXT: s_mov_b32 s13, s3 1081; VI-NEXT: s_mov_b32 s15, s11 1082; VI-NEXT: s_mov_b32 s4, s6 1083; VI-NEXT: s_mov_b32 s5, s7 1084; VI-NEXT: s_mov_b32 s6, s10 1085; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 1086; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 1087; VI-NEXT: s_mov_b32 s7, s11 1088; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 1089; VI-NEXT: v_mov_b32_e32 v3, 0x3800 1090; VI-NEXT: v_mov_b32_e32 v4, 0x3900 1091; VI-NEXT: s_mov_b32 s8, s0 1092; VI-NEXT: s_mov_b32 s9, s1 1093; VI-NEXT: s_waitcnt vmcnt(2) 1094; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 1095; VI-NEXT: s_waitcnt vmcnt(1) 1096; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 1097; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 1098; VI-NEXT: s_waitcnt vmcnt(0) 1099; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 1100; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1101; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 1102; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 1103; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1104; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1105; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1106; VI-NEXT: s_endpgm 1107; 1108; GFX11-LABEL: select_v2f16_imm_c: 
1109; GFX11: ; %bb.0: ; %entry 1110; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 1111; GFX11-NEXT: s_mov_b32 s10, -1 1112; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1113; GFX11-NEXT: s_mov_b32 s18, s10 1114; GFX11-NEXT: s_mov_b32 s19, s11 1115; GFX11-NEXT: s_mov_b32 s14, s10 1116; GFX11-NEXT: s_mov_b32 s15, s11 1117; GFX11-NEXT: s_mov_b32 s22, s10 1118; GFX11-NEXT: s_mov_b32 s23, s11 1119; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1120; GFX11-NEXT: s_mov_b32 s16, s4 1121; GFX11-NEXT: s_mov_b32 s17, s5 1122; GFX11-NEXT: s_mov_b32 s12, s2 1123; GFX11-NEXT: s_mov_b32 s13, s3 1124; GFX11-NEXT: s_mov_b32 s20, s6 1125; GFX11-NEXT: s_mov_b32 s21, s7 1126; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 1127; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 1128; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 1129; GFX11-NEXT: s_mov_b32 s8, s0 1130; GFX11-NEXT: s_mov_b32 s9, s1 1131; GFX11-NEXT: s_waitcnt vmcnt(2) 1132; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1133; GFX11-NEXT: s_waitcnt vmcnt(1) 1134; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 1135; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 1136; GFX11-NEXT: s_waitcnt vmcnt(0) 1137; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1138; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo 1139; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3 1140; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) 1141; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1142; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo 1143; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1144; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1145; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1146; GFX11-NEXT: s_endpgm 1147 ptr addrspace(1) %r, 1148 ptr addrspace(1) %a, 1149 ptr addrspace(1) %b, 1150 ptr addrspace(1) %d) { 1151entry: 1152 %a.val = load <2 x half>, ptr addrspace(1) %a 1153 %b.val = load <2 x half>, ptr addrspace(1) %b 1154 %d.val = load <2 x half>, ptr addrspace(1) %d 1155 %fcmp = fcmp olt <2 x half> %a.val, %b.val 1156 %r.val = select 
<2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val 1157 store <2 x half> %r.val, ptr addrspace(1) %r 1158 ret void 1159} 1160 1161define amdgpu_kernel void @select_v2f16_imm_d( 1162; SI-LABEL: select_v2f16_imm_d: 1163; SI: ; %bb.0: ; %entry 1164; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 1165; SI-NEXT: s_mov_b32 s11, 0xf000 1166; SI-NEXT: s_mov_b32 s10, -1 1167; SI-NEXT: s_mov_b32 s14, s10 1168; SI-NEXT: s_mov_b32 s15, s11 1169; SI-NEXT: s_waitcnt lgkmcnt(0) 1170; SI-NEXT: s_mov_b32 s12, s2 1171; SI-NEXT: s_mov_b32 s13, s3 1172; SI-NEXT: s_mov_b32 s16, s4 1173; SI-NEXT: s_mov_b32 s17, s5 1174; SI-NEXT: s_mov_b32 s18, s10 1175; SI-NEXT: s_mov_b32 s19, s11 1176; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 1177; SI-NEXT: s_mov_b32 s4, s6 1178; SI-NEXT: s_mov_b32 s5, s7 1179; SI-NEXT: s_mov_b32 s6, s10 1180; SI-NEXT: s_mov_b32 s7, s11 1181; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 1182; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 1183; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000 1184; SI-NEXT: s_mov_b32 s8, s0 1185; SI-NEXT: s_mov_b32 s9, s1 1186; SI-NEXT: s_waitcnt vmcnt(2) 1187; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 1188; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1189; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1190; SI-NEXT: s_waitcnt vmcnt(1) 1191; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 1192; SI-NEXT: s_waitcnt vmcnt(0) 1193; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1194; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1195; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1196; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1197; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1198; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 1199; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1200; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 1201; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1202; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc 1203; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1204; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 1205; SI-NEXT: v_or_b32_e32 v0, v0, v1 1206; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1207; SI-NEXT: s_endpgm 1208; 
1209; VI-LABEL: select_v2f16_imm_d: 1210; VI: ; %bb.0: ; %entry 1211; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1212; VI-NEXT: s_mov_b32 s11, 0xf000 1213; VI-NEXT: s_mov_b32 s10, -1 1214; VI-NEXT: s_mov_b32 s18, s10 1215; VI-NEXT: s_mov_b32 s19, s11 1216; VI-NEXT: s_waitcnt lgkmcnt(0) 1217; VI-NEXT: s_mov_b32 s16, s4 1218; VI-NEXT: s_mov_b32 s17, s5 1219; VI-NEXT: s_mov_b32 s14, s10 1220; VI-NEXT: s_mov_b32 s12, s2 1221; VI-NEXT: s_mov_b32 s13, s3 1222; VI-NEXT: s_mov_b32 s15, s11 1223; VI-NEXT: s_mov_b32 s4, s6 1224; VI-NEXT: s_mov_b32 s5, s7 1225; VI-NEXT: s_mov_b32 s6, s10 1226; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 1227; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 1228; VI-NEXT: s_mov_b32 s7, s11 1229; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 1230; VI-NEXT: v_mov_b32_e32 v3, 0x3800 1231; VI-NEXT: v_mov_b32_e32 v4, 0x3900 1232; VI-NEXT: s_mov_b32 s8, s0 1233; VI-NEXT: s_mov_b32 s9, s1 1234; VI-NEXT: s_waitcnt vmcnt(2) 1235; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 1236; VI-NEXT: s_waitcnt vmcnt(1) 1237; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 1238; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 1239; VI-NEXT: s_waitcnt vmcnt(0) 1240; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 1241; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1242; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 1243; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 1244; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1245; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1246; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1247; VI-NEXT: s_endpgm 1248; 1249; GFX11-LABEL: select_v2f16_imm_d: 1250; GFX11: ; %bb.0: ; %entry 1251; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 1252; GFX11-NEXT: s_mov_b32 s10, -1 1253; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1254; GFX11-NEXT: s_mov_b32 s18, s10 1255; GFX11-NEXT: s_mov_b32 s19, s11 1256; GFX11-NEXT: s_mov_b32 s14, s10 1257; GFX11-NEXT: s_mov_b32 s15, s11 1258; GFX11-NEXT: s_mov_b32 s22, s10 1259; GFX11-NEXT: s_mov_b32 s23, 
s11 1260; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX11-NEXT: s_mov_b32 s16, s4 1262; GFX11-NEXT: s_mov_b32 s17, s5 1263; GFX11-NEXT: s_mov_b32 s12, s2 1264; GFX11-NEXT: s_mov_b32 s13, s3 1265; GFX11-NEXT: s_mov_b32 s20, s6 1266; GFX11-NEXT: s_mov_b32 s21, s7 1267; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 1268; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 1269; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 1270; GFX11-NEXT: s_mov_b32 s8, s0 1271; GFX11-NEXT: s_mov_b32 s9, s1 1272; GFX11-NEXT: s_waitcnt vmcnt(2) 1273; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1274; GFX11-NEXT: s_waitcnt vmcnt(1) 1275; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 1276; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 1277; GFX11-NEXT: s_waitcnt vmcnt(0) 1278; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1279; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo 1280; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3 1281; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) 1282; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1283; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo 1284; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1285; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1286; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1287; GFX11-NEXT: s_endpgm 1288 ptr addrspace(1) %r, 1289 ptr addrspace(1) %a, 1290 ptr addrspace(1) %b, 1291 ptr addrspace(1) %c) { 1292entry: 1293 %a.val = load <2 x half>, ptr addrspace(1) %a 1294 %b.val = load <2 x half>, ptr addrspace(1) %b 1295 %c.val = load <2 x half>, ptr addrspace(1) %c 1296 %fcmp = fcmp olt <2 x half> %a.val, %b.val 1297 %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900> 1298 store <2 x half> %r.val, ptr addrspace(1) %r 1299 ret void 1300} 1301 1302define <4 x half> @v_select_v4f16(<4 x half> %a, <4 x half> %b, i32 %cond) { 1303; SI-LABEL: v_select_v4f16: 1304; SI: ; %bb.0: 1305; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1306; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 
1307; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1308; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1309; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1310; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1311; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1312; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 1313; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1314; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1315; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1316; SI-NEXT: v_or_b32_e32 v2, v2, v3 1317; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 1318; SI-NEXT: v_or_b32_e32 v0, v0, v1 1319; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 1320; SI-NEXT: v_or_b32_e32 v3, v6, v3 1321; SI-NEXT: v_or_b32_e32 v1, v4, v1 1322; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 1323; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 1324; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc 1325; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 1326; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 1327; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1328; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1329; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1330; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1331; SI-NEXT: s_setpc_b64 s[30:31] 1332; 1333; VI-LABEL: v_select_v4f16: 1334; VI: ; %bb.0: 1335; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1336; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 1337; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 1338; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 1339; VI-NEXT: s_setpc_b64 s[30:31] 1340; 1341; GFX11-LABEL: v_select_v4f16: 1342; GFX11: ; %bb.0: 1343; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1344; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 1345; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 1346; GFX11-NEXT: s_setpc_b64 s[30:31] 1347 %cmp = icmp eq i32 %cond, 0 1348 %select = select i1 %cmp, <4 x half> %a, <4 x half> %b 1349 ret <4 x half> %select 1350} 1351 1352define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond) { 1353; SI-LABEL: v_vselect_v4f16: 1354; SI: ; %bb.0: 1355; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1356; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1357; SI-NEXT: 
v_cvt_f16_f32_e32 v4, v4 1358; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1359; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1360; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1361; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 1362; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1363; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1364; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1365; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1366; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1367; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1368; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1369; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1370; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1371; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1372; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 1373; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 1374; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 1375; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 1376; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 1377; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 1378; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 1379; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 1380; SI-NEXT: s_setpc_b64 s[30:31] 1381; 1382; VI-LABEL: v_vselect_v4f16: 1383; VI: ; %bb.0: 1384; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1385; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 1386; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 1387; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 1388; VI-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc 1389; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 1390; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 1391; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 1392; VI-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc 1393; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 1394; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 1395; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 1396; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 1397; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 1398; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1399; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 1400; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1401; VI-NEXT: s_setpc_b64 s[30:31] 1402; 1403; 
GFX11-LABEL: v_vselect_v4f16: 1404; GFX11: ; %bb.0: 1405; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1406; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 1407; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 1408; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 1409; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 1410; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 1411; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 1412; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo 1413; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 1414; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo 1415; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 1416; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo 1417; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 1418; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1419; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 1420; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 1421; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 1422; GFX11-NEXT: s_setpc_b64 s[30:31] 1423 %cmp = icmp eq <4 x i32> %cond, zeroinitializer 1424 %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b 1425 ret <4 x half> %select 1426} 1427 1428define <8 x half> @v_select_v8f16(<8 x half> %a, <8 x half> %b, i32 %cond) { 1429; SI-LABEL: v_select_v8f16: 1430; SI: ; %bb.0: 1431; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1432; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1433; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1434; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1435; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1436; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 1437; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 1438; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1439; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 1440; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1441; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 1442; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1443; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 1444; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 1445; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 1446; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 
1447; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 1448; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 1449; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1450; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1451; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1452; SI-NEXT: v_or_b32_e32 v6, v6, v7 1453; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v15 1454; SI-NEXT: v_or_b32_e32 v4, v4, v5 1455; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 1456; SI-NEXT: v_or_b32_e32 v2, v2, v3 1457; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 1458; SI-NEXT: v_or_b32_e32 v0, v0, v1 1459; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 1460; SI-NEXT: v_or_b32_e32 v7, v14, v7 1461; SI-NEXT: v_or_b32_e32 v5, v12, v5 1462; SI-NEXT: v_or_b32_e32 v3, v10, v3 1463; SI-NEXT: v_or_b32_e32 v1, v8, v1 1464; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 1465; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 1466; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc 1467; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1468; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc 1469; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 1470; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 1471; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 1472; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 1473; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1474; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1475; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1476; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1477; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1478; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1479; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1480; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1481; SI-NEXT: s_setpc_b64 s[30:31] 1482; 1483; VI-LABEL: v_select_v8f16: 1484; VI: ; %bb.0: 1485; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1486; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 1487; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 1488; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 1489; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 1490; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 1491; VI-NEXT: s_setpc_b64 s[30:31] 1492; 1493; GFX11-LABEL: v_select_v8f16: 1494; GFX11: ; %bb.0: 1495; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1496; 
GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 1497; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 1498; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 1499; GFX11-NEXT: s_setpc_b64 s[30:31] 1500 %cmp = icmp eq i32 %cond, 0 1501 %select = select i1 %cmp, <8 x half> %a, <8 x half> %b 1502 ret <8 x half> %select 1503} 1504 1505define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond) { 1506; SI-LABEL: v_vselect_v8f16: 1507; SI: ; %bb.0: 1508; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1509; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1510; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 1511; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 1512; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1513; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1514; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 1515; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1516; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 1517; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1518; SI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 1519; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 1520; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1521; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 1522; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1523; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 1524; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 1525; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1526; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 1527; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 1528; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 1529; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 1530; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1531; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 1532; SI-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 1533; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1534; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 1535; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1536; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 1537; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1538; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 1539; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1540; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 1541; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 1542; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1543; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v9 1544; SI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc 1545; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 1546; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1547; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 1548; SI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc 1549; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 1550; SI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc 1551; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 1552; SI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc 1553; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 1554; SI-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc 1555; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 1556; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc 1557; SI-NEXT: s_setpc_b64 s[30:31] 1558; 1559; VI-LABEL: v_vselect_v8f16: 1560; VI: ; %bb.0: 1561; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1562; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 1563; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 1564; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 1565; VI-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc 1566; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 1567; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 1568; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 1569; VI-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc 1570; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 1571; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 1572; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 1573; VI-NEXT: v_cndmask_b32_e32 v11, v17, v16, vcc 1574; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 1575; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 1576; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 1577; VI-NEXT: v_cndmask_b32_e32 v9, v17, v16, vcc 1578; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 1579; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 1580; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 1581; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 1582; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 1583; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 1584; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 1585; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 1586; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 1587; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1588; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 1589; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1590; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 1591; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1592; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v15 1593; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1594; VI-NEXT: s_setpc_b64 s[30:31] 1595; 1596; GFX11-LABEL: v_vselect_v8f16: 1597; GFX11: ; %bb.0: 1598; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1599; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v3 1600; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 1601; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15 1602; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1 1603; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 1604; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0 1605; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4 1606; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo 1607; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2 1608; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v6 1609; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 1610; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1611; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo 1612; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 1613; GFX11-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo 1614; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 1615; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo 1616; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 1617; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 1618; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 1619; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 1620; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 1621; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo 1622; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 1623; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 1624; 
GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 1625; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14 1626; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1627; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 1628; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo 1629; GFX11-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 1630; GFX11-NEXT: s_setpc_b64 s[30:31] 1631 %cmp = icmp eq <8 x i32> %cond, zeroinitializer 1632 %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b 1633 ret <8 x half> %select 1634} 1635 1636define <16 x half> @v_select_v16f16(<16 x half> %a, <16 x half> %b, i32 %cond) { 1637; SI-LABEL: v_select_v16f16: 1638; SI: ; %bb.0: 1639; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1640; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 1641; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 1642; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 1643; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 1644; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 1645; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 1646; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 1647; SI-NEXT: v_or_b32_e32 v12, v12, v13 1648; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 1649; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 1650; SI-NEXT: v_or_b32_e32 v14, v14, v15 1651; SI-NEXT: v_cvt_f16_f32_e32 v15, v28 1652; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 1653; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 1654; SI-NEXT: v_or_b32_e32 v10, v10, v11 1655; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 1656; SI-NEXT: v_or_b32_e32 v13, v15, v13 1657; SI-NEXT: v_cvt_f16_f32_e32 v15, v27 1658; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 1659; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1660; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 1661; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 1662; SI-NEXT: v_or_b32_e32 v15, v26, v15 1663; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 1664; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1665; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1666; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1667; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 1668; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 
1669; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1670; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1671; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 1672; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 1673; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1674; SI-NEXT: v_or_b32_e32 v2, v2, v3 1675; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1676; SI-NEXT: v_or_b32_e32 v8, v8, v9 1677; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 1678; SI-NEXT: v_or_b32_e32 v6, v6, v7 1679; SI-NEXT: v_cvt_f16_f32_e32 v7, v23 1680; SI-NEXT: v_or_b32_e32 v4, v4, v5 1681; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 1682; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 1683; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1684; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 1685; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 1686; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 1687; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 1688; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 1689; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 1690; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1691; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 1692; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 1693; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1694; SI-NEXT: v_or_b32_e32 v0, v0, v1 1695; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v17 1696; SI-NEXT: v_or_b32_e32 v9, v24, v9 1697; SI-NEXT: v_or_b32_e32 v7, v22, v7 1698; SI-NEXT: v_or_b32_e32 v5, v20, v5 1699; SI-NEXT: v_or_b32_e32 v1, v16, v1 1700; SI-NEXT: s_waitcnt vmcnt(1) 1701; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 1702; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 1703; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1704; SI-NEXT: v_or_b32_e32 v3, v11, v3 1705; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v19 1706; SI-NEXT: v_or_b32_e32 v11, v18, v11 1707; SI-NEXT: s_waitcnt vmcnt(0) 1708; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 1709; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 1710; SI-NEXT: v_cndmask_b32_e32 v11, v11, v2, vcc 1711; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1712; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc 1713; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc 1714; SI-NEXT: v_cndmask_b32_e32 v15, v15, v10, vcc 1715; SI-NEXT: v_cndmask_b32_e32 v13, v13, 
v12, vcc 1716; SI-NEXT: v_cndmask_b32_e32 v16, v3, v14, vcc 1717; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 1718; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 1719; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 1720; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 1721; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 1722; SI-NEXT: v_cvt_f32_f16_e32 v10, v15 1723; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 1724; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1725; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 1726; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1727; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1728; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 1729; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 1730; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 1731; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v16 1732; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 1733; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1734; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1735; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1736; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1737; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 1738; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 1739; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 1740; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 1741; SI-NEXT: s_setpc_b64 s[30:31] 1742; 1743; VI-LABEL: v_select_v16f16: 1744; VI: ; %bb.0: 1745; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1746; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 1747; VI-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 1748; VI-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 1749; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc 1750; VI-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc 1751; VI-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc 1752; VI-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc 1753; VI-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc 1754; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc 1755; VI-NEXT: s_setpc_b64 s[30:31] 1756; 1757; GFX11-LABEL: v_select_v16f16: 1758; GFX11: ; %bb.0: 1759; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1760; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 1761; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 1762; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, 
v2 :: v_dual_cndmask_b32 v3, v11, v3 1763; GFX11-NEXT: v_dual_cndmask_b32 v4, v12, v4 :: v_dual_cndmask_b32 v5, v13, v5 1764; GFX11-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7 1765; GFX11-NEXT: s_setpc_b64 s[30:31] 1766 %cmp = icmp eq i32 %cond, 0 1767 %select = select i1 %cmp, <16 x half> %a, <16 x half> %b 1768 ret <16 x half> %select 1769} 1770 1771define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> %cond) { 1772; SI-LABEL: v_vselect_v16f16: 1773; SI: ; %bb.0: 1774; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1775; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 1776; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 1777; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1778; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1779; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 1780; SI-NEXT: v_cvt_f32_f16_e32 v37, v16 1781; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 1782; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 1783; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 1784; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 1785; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 1786; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 1787; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 1788; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1789; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1790; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 1791; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1792; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1793; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 1794; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 1795; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1796; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1797; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1798; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 1799; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 1800; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 1801; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 1802; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1803; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 1804; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1805; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v8 1806; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 1807; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 1808; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 1809; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 1810; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 1811; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 1812; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 1813; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 1814; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 1815; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 1816; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 1817; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 1818; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 1819; SI-NEXT: s_waitcnt vmcnt(7) 1820; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 1821; SI-NEXT: v_cndmask_b32_e32 v0, v37, v0, vcc 1822; SI-NEXT: s_waitcnt vmcnt(6) 1823; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 1824; SI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc 1825; SI-NEXT: v_cvt_f16_f32_e32 v17, v18 1826; SI-NEXT: s_waitcnt vmcnt(5) 1827; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 1828; SI-NEXT: v_cvt_f16_f32_e32 v18, v20 1829; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 1830; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 1831; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 1832; SI-NEXT: v_cndmask_b32_e32 v2, v17, v2, vcc 1833; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 1834; SI-NEXT: s_waitcnt vmcnt(5) 1835; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 1836; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 1837; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 1838; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 1839; SI-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc 1840; SI-NEXT: s_waitcnt vmcnt(6) 1841; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 1842; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 1843; SI-NEXT: v_cvt_f16_f32_e32 v17, v21 1844; SI-NEXT: v_cndmask_b32_e32 v4, v18, v4, vcc 1845; SI-NEXT: v_cvt_f16_f32_e32 v18, v22 1846; SI-NEXT: s_waitcnt vmcnt(6) 1847; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 1848; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 1849; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 1850; SI-NEXT: v_cvt_f32_f16_e32 
v21, v18 1851; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 1852; SI-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc 1853; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 1854; SI-NEXT: s_waitcnt vmcnt(7) 1855; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 1856; SI-NEXT: v_cndmask_b32_e32 v6, v21, v6, vcc 1857; SI-NEXT: s_waitcnt vmcnt(6) 1858; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 1859; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 1860; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 1861; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 1862; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 1863; SI-NEXT: v_cvt_f16_f32_e32 v24, v25 1864; SI-NEXT: v_cndmask_b32_e32 v7, v22, v7, vcc 1865; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 1866; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 1867; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 1868; SI-NEXT: s_waitcnt vmcnt(7) 1869; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 1870; SI-NEXT: v_cndmask_b32_e32 v8, v23, v8, vcc 1871; SI-NEXT: v_cvt_f16_f32_e32 v23, v26 1872; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 1873; SI-NEXT: s_waitcnt vmcnt(5) 1874; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 1875; SI-NEXT: v_cndmask_b32_e32 v9, v24, v9, vcc 1876; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 1877; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 1878; SI-NEXT: v_cvt_f16_f32_e32 v19, v28 1879; SI-NEXT: v_cndmask_b32_e32 v10, v23, v10, vcc 1880; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 1881; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20 1882; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 1883; SI-NEXT: v_cvt_f16_f32_e32 v20, v29 1884; SI-NEXT: v_cndmask_b32_e32 v11, v24, v11, vcc 1885; SI-NEXT: s_waitcnt vmcnt(3) 1886; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 1887; SI-NEXT: v_cvt_f16_f32_e32 v17, v30 1888; SI-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc 1889; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 1890; SI-NEXT: s_waitcnt vmcnt(1) 1891; SI-NEXT: v_cvt_f16_f32_e32 v18, v21 1892; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 1893; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 1894; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 
1895; SI-NEXT: v_cndmask_b32_e32 v13, v20, v13, vcc 1896; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 1897; SI-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc 1898; SI-NEXT: s_waitcnt vmcnt(0) 1899; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22 1900; SI-NEXT: v_cndmask_b32_e32 v15, v18, v15, vcc 1901; SI-NEXT: s_setpc_b64 s[30:31] 1902; 1903; VI-LABEL: v_vselect_v16f16: 1904; VI: ; %bb.0: 1905; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1906; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 1907; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill 1908; VI-NEXT: s_mov_b64 exec, s[4:5] 1909; VI-NEXT: v_writelane_b32 v31, s30, 0 1910; VI-NEXT: v_writelane_b32 v31, s31, 1 1911; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 1912; VI-NEXT: v_cmp_eq_u32_e64 s[18:19], 0, v17 1913; VI-NEXT: v_cmp_eq_u32_e64 s[30:31], 0, v29 1914; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 1915; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 1916; VI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 1917; VI-NEXT: v_cmp_eq_u32_e64 s[28:29], 0, v27 1918; VI-NEXT: v_cndmask_b32_e64 v16, v17, v16, s[30:31] 1919; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v5 1920; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v13 1921; VI-NEXT: v_cmp_eq_u32_e64 s[20:21], 0, v19 1922; VI-NEXT: v_cmp_eq_u32_e64 s[26:27], 0, v25 1923; VI-NEXT: v_cndmask_b32_e64 v17, v18, v17, s[28:29] 1924; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 1925; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 1926; VI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v20 1927; VI-NEXT: v_cmp_eq_u32_e64 s[24:25], 0, v23 1928; VI-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[26:27] 1929; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 1930; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 1931; VI-NEXT: v_cmp_eq_u32_e64 s[22:23], 0, v21 1932; VI-NEXT: v_cndmask_b32_e64 v19, v20, v19, s[24:25] 1933; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 1934; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v10 1935; VI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v22 1936; VI-NEXT: v_cndmask_b32_e64 v20, v21, v20, s[22:23] 1937; VI-NEXT: v_lshrrev_b32_e32 v21, 16, 
v1 1938; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v9 1939; VI-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[20:21] 1940; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v0 1941; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 1942; VI-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] 1943; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 1944; VI-NEXT: v_cndmask_b32_e64 v22, v23, v22, s[18:19] 1945; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 1946; VI-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] 1947; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 1948; VI-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[8:9] 1949; VI-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1950; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 1951; VI-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc 1952; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 1953; VI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v24 1954; VI-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[10:11] 1955; VI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v26 1956; VI-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] 1957; VI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v28 1958; VI-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[14:15] 1959; VI-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[16:17] 1960; VI-NEXT: v_readlane_b32 s31, v31, 1 1961; VI-NEXT: v_readlane_b32 s30, v31, 0 1962; VI-NEXT: s_waitcnt vmcnt(0) 1963; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 1964; VI-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc 1965; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v21 1966; VI-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1967; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 1968; VI-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1969; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v19 1970; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1971; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v18 1972; VI-NEXT: v_or_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1973; VI-NEXT: 
v_lshlrev_b32_e32 v9, 16, v17 1974; VI-NEXT: v_or_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1975; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v16 1976; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 1977; VI-NEXT: v_or_b32_sdwa v6, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1978; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1979; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 1980; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload 1981; VI-NEXT: s_mov_b64 exec, s[4:5] 1982; VI-NEXT: s_waitcnt vmcnt(0) 1983; VI-NEXT: s_setpc_b64 s[30:31] 1984; 1985; GFX11-LABEL: v_vselect_v16f16: 1986; GFX11: ; %bb.0: 1987; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1988; GFX11-NEXT: scratch_load_b32 v31, off, s32 1989; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30 1990; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 1991; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 1992; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 1993; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4 1994; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo 1995; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 1996; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 1997; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2 1998; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1 1999; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 2000; GFX11-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo 2001; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 2002; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0 2003; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v13 2004; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 2005; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 2006; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo 2007; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24 2008; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v9 2009; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10 2010; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 2011; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 2012; 
GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo 2013; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22 2014; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo 2015; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20 2016; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo 2017; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 2018; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo 2019; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 2020; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo 2021; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 2022; GFX11-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo 2023; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 2024; GFX11-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo 2025; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 2026; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 2027; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100 2028; GFX11-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 2029; GFX11-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo 2030; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19 2031; GFX11-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo 2032; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17 2033; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 2034; GFX11-NEXT: v_perm_b32 v4, v10, v4, 0x5040100 2035; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 2036; GFX11-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo 2037; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21 2038; GFX11-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo 2039; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23 2040; GFX11-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo 2041; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2042; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 2043; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 2044; GFX11-NEXT: s_waitcnt vmcnt(0) 2045; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 2046; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo 2047; GFX11-NEXT: v_perm_b32 v0, v12, v0, 0x5040100 
2048; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2049; GFX11-NEXT: v_perm_b32 v7, v11, v7, 0x5040100 2050; GFX11-NEXT: s_setpc_b64 s[30:31] 2051 %cmp = icmp eq <16 x i32> %cond, zeroinitializer 2052 %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b 2053 ret <16 x half> %select 2054} 2055 2056define <32 x half> @v_select_v32f16(<32 x half> %a, <32 x half> %b, i32 %cond) { 2057; SI-LABEL: v_select_v32f16: 2058; SI: ; %bb.0: 2059; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2060; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 2061; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 2062; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 2063; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 2064; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 2065; SI-NEXT: v_or_b32_e32 v20, v20, v21 2066; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 2067; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 2068; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 2069; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 2070; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 2071; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 2072; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 2073; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 2074; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 2075; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 2076; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 2077; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 2078; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 2079; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 2080; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 2081; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 2082; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 2083; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 2084; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 2085; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2086; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2087; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2088; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 2089; SI-NEXT: v_or_b32_e32 v12, v12, v13 2090; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 2091; SI-NEXT: v_or_b32_e32 v10, v10, v11 2092; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 2093; SI-NEXT: v_or_b32_e32 v8, v8, v9 2094; 
SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 2095; SI-NEXT: v_or_b32_e32 v6, v6, v7 2096; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 2097; SI-NEXT: v_or_b32_e32 v4, v4, v5 2098; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 2099; SI-NEXT: v_or_b32_e32 v2, v2, v3 2100; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 2101; SI-NEXT: v_or_b32_e32 v0, v0, v1 2102; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 2103; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 2104; SI-NEXT: v_or_b32_e32 v22, v22, v23 2105; SI-NEXT: v_cvt_f16_f32_e32 v23, v30 2106; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 2107; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 2108; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 2109; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 2110; SI-NEXT: v_or_b32_e32 v24, v24, v25 2111; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 2112; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 2113; SI-NEXT: v_or_b32_e32 v26, v26, v27 2114; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 2115; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 2116; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 2117; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 2118; SI-NEXT: v_or_b32_e32 v28, v28, v29 2119; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 2120; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 2121; SI-NEXT: v_or_b32_e32 v18, v18, v19 2122; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 2123; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 2124; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 2125; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 2126; SI-NEXT: v_or_b32_e32 v16, v16, v17 2127; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:124 2128; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 2129; SI-NEXT: v_or_b32_e32 v14, v14, v15 2130; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:116 2131; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 2132; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:100 2133; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 2134; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 2135; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:80 2136; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 2137; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 2138; SI-NEXT: s_waitcnt vmcnt(14) 2139; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 2140; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 2141; SI-NEXT: v_or_b32_e32 v21, v23, v21 2142; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 2143; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 2144; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 2145; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 2146; SI-NEXT: s_waitcnt vmcnt(14) 2147; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 2148; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 2149; SI-NEXT: s_waitcnt vmcnt(13) 2150; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 2151; SI-NEXT: s_waitcnt vmcnt(12) 2152; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 2153; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 2154; SI-NEXT: s_waitcnt vmcnt(11) 2155; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 2156; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 2157; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2158; SI-NEXT: s_waitcnt vmcnt(10) 2159; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 2160; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2161; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2162; SI-NEXT: s_waitcnt vmcnt(9) 2163; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 2164; SI-NEXT: s_waitcnt vmcnt(8) 2165; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 2166; SI-NEXT: s_waitcnt vmcnt(7) 2167; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 2168; SI-NEXT: s_waitcnt vmcnt(6) 2169; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 2170; SI-NEXT: s_waitcnt vmcnt(5) 2171; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2172; SI-NEXT: s_waitcnt vmcnt(4) 2173; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2174; SI-NEXT: s_waitcnt vmcnt(0) 2175; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 2176; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 2177; SI-NEXT: v_or_b32_e32 v23, v25, v23 2178; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:120 2179; SI-NEXT: s_waitcnt vmcnt(0) 2180; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 2181; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 2182; 
SI-NEXT: v_or_b32_e32 v25, v27, v25 2183; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 2184; SI-NEXT: s_waitcnt vmcnt(0) 2185; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 2186; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 2187; SI-NEXT: v_or_b32_e32 v27, v29, v27 2188; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:104 2189; SI-NEXT: s_waitcnt vmcnt(0) 2190; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 2191; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 2192; SI-NEXT: v_or_b32_e32 v29, v30, v29 2193; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 2194; SI-NEXT: s_waitcnt vmcnt(0) 2195; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 2196; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 2197; SI-NEXT: v_or_b32_e32 v30, v31, v30 2198; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 2199; SI-NEXT: s_waitcnt vmcnt(0) 2200; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2201; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 2202; SI-NEXT: v_or_b32_e32 v31, v32, v31 2203; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 2204; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 2205; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 2206; SI-NEXT: s_waitcnt vmcnt(0) 2207; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2208; SI-NEXT: v_or_b32_e32 v19, v32, v19 2209; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 2210; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 2211; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 2212; SI-NEXT: s_waitcnt vmcnt(0) 2213; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2214; SI-NEXT: v_or_b32_e32 v17, v32, v17 2215; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 2216; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 2217; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 2218; SI-NEXT: s_waitcnt vmcnt(0) 2219; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2220; SI-NEXT: v_or_b32_e32 v15, v32, v15 2221; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 2222; SI-NEXT: s_waitcnt vmcnt(0) 2223; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2224; SI-NEXT: v_or_b32_e32 v13, v32, v13 2225; SI-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:44 2226; SI-NEXT: s_waitcnt vmcnt(0) 2227; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2228; SI-NEXT: v_or_b32_e32 v11, v32, v11 2229; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 2230; SI-NEXT: s_waitcnt vmcnt(0) 2231; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2232; SI-NEXT: v_or_b32_e32 v9, v32, v9 2233; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 2234; SI-NEXT: s_waitcnt vmcnt(0) 2235; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2236; SI-NEXT: v_or_b32_e32 v7, v32, v7 2237; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 2238; SI-NEXT: s_waitcnt vmcnt(0) 2239; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2240; SI-NEXT: v_or_b32_e32 v5, v32, v5 2241; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 2242; SI-NEXT: s_waitcnt vmcnt(0) 2243; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2244; SI-NEXT: v_or_b32_e32 v3, v32, v3 2245; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 2246; SI-NEXT: s_waitcnt vmcnt(0) 2247; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2248; SI-NEXT: v_or_b32_e32 v1, v32, v1 2249; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 2250; SI-NEXT: s_waitcnt vmcnt(0) 2251; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 2252; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc 2253; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc 2254; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 2255; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc 2256; SI-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc 2257; SI-NEXT: v_cndmask_b32_e32 v11, v11, v10, vcc 2258; SI-NEXT: v_cndmask_b32_e32 v13, v13, v12, vcc 2259; SI-NEXT: v_cndmask_b32_e32 v15, v15, v14, vcc 2260; SI-NEXT: v_cndmask_b32_e32 v17, v17, v16, vcc 2261; SI-NEXT: v_cndmask_b32_e32 v19, v19, v18, vcc 2262; SI-NEXT: v_cndmask_b32_e32 v31, v31, v20, vcc 2263; SI-NEXT: v_cndmask_b32_e32 v30, v30, v22, vcc 2264; SI-NEXT: v_cndmask_b32_e32 v29, v29, v24, vcc 2265; SI-NEXT: v_cndmask_b32_e32 v27, v27, v26, vcc 2266; SI-NEXT: v_cndmask_b32_e32 v32, v25, v28, vcc 2267; SI-NEXT: v_cndmask_b32_e32 v33, v23, 
v21, vcc 2268; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 2269; SI-NEXT: v_cvt_f32_f16_e32 v2, v3 2270; SI-NEXT: v_cvt_f32_f16_e32 v4, v5 2271; SI-NEXT: v_cvt_f32_f16_e32 v6, v7 2272; SI-NEXT: v_cvt_f32_f16_e32 v8, v9 2273; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2274; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2275; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 2276; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 2277; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 2278; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 2279; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 2280; SI-NEXT: v_cvt_f32_f16_e32 v12, v13 2281; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 2282; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 2283; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 2284; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 2285; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 2286; SI-NEXT: v_cvt_f32_f16_e32 v18, v19 2287; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 2288; SI-NEXT: v_cvt_f32_f16_e32 v20, v31 2289; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v31 2290; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v30 2291; SI-NEXT: v_cvt_f32_f16_e32 v24, v29 2292; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v29 2293; SI-NEXT: v_cvt_f32_f16_e32 v26, v27 2294; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 2295; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v32 2296; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 2297; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2298; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 2299; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 2300; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 2301; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 2302; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 2303; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 2304; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 2305; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 2306; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 2307; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 2308; SI-NEXT: v_cvt_f32_f16_e32 v22, v30 2309; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 2310; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 2311; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 2312; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 2313; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 2314; SI-NEXT: 
v_cvt_f32_f16_e32 v30, v33 2315; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2316; SI-NEXT: s_setpc_b64 s[30:31] 2317; 2318; VI-LABEL: v_select_v32f16: 2319; VI: ; %bb.0: 2320; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2321; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 2322; VI-NEXT: s_waitcnt vmcnt(0) 2323; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2324; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc 2325; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 2326; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc 2327; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc 2328; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc 2329; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc 2330; VI-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc 2331; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc 2332; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc 2333; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc 2334; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc 2335; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc 2336; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc 2337; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc 2338; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc 2339; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc 2340; VI-NEXT: s_waitcnt vmcnt(0) 2341; VI-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc 2342; VI-NEXT: s_setpc_b64 s[30:31] 2343; 2344; GFX11-LABEL: v_select_v32f16: 2345; GFX11: ; %bb.0: 2346; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2347; GFX11-NEXT: s_clause 0x1 2348; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:4 2349; GFX11-NEXT: scratch_load_b32 v32, off, s32 2350; GFX11-NEXT: s_waitcnt vmcnt(1) 2351; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 2352; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1 2353; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v3 2354; GFX11-NEXT: v_dual_cndmask_b32 v4, v20, v4 :: v_dual_cndmask_b32 v5, v21, v5 2355; GFX11-NEXT: v_dual_cndmask_b32 v6, v22, v6 :: v_dual_cndmask_b32 v7, v23, v7 2356; GFX11-NEXT: 
v_dual_cndmask_b32 v8, v24, v8 :: v_dual_cndmask_b32 v9, v25, v9 2357; GFX11-NEXT: v_dual_cndmask_b32 v10, v26, v10 :: v_dual_cndmask_b32 v11, v27, v11 2358; GFX11-NEXT: v_dual_cndmask_b32 v12, v28, v12 :: v_dual_cndmask_b32 v13, v29, v13 2359; GFX11-NEXT: s_waitcnt vmcnt(0) 2360; GFX11-NEXT: v_dual_cndmask_b32 v14, v30, v14 :: v_dual_cndmask_b32 v15, v32, v15 2361; GFX11-NEXT: s_setpc_b64 s[30:31] 2362 %cmp = icmp eq i32 %cond, 0 2363 %select = select i1 %cmp, <32 x half> %a, <32 x half> %b 2364 ret <32 x half> %select 2365} 2366 2367define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> %cond) { 2368; SI-LABEL: v_vselect_v32f16: 2369; SI: ; %bb.0: 2370; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2371; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 2372; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 2373; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 2374; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 2375; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 2376; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 2377; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2378; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 2379; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 2380; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 2381; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 2382; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 2383; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 2384; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 2385; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 2386; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 2387; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 2388; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 2389; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 2390; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 2391; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 2392; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 2393; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 2394; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 2395; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 2396; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 2397; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 2398; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 2399; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 2400; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 2401; SI-NEXT: 
v_cvt_f32_f16_e32 v13, v13 2402; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 2403; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 2404; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 2405; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 2406; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 2407; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 2408; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 2409; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 2410; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 2411; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 2412; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 2413; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 2414; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 2415; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 2416; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 2417; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 2418; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 2419; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 2420; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 2421; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 2422; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 2423; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 2424; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 2425; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 2426; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 2427; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 2428; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 2429; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 2430; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 2431; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 2432; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 2433; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 2434; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 2435; SI-NEXT: s_waitcnt vmcnt(1) 2436; SI-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v31 2437; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 2438; SI-NEXT: s_waitcnt vmcnt(1) 2439; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 2440; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 2441; SI-NEXT: s_waitcnt vmcnt(0) 2442; SI-NEXT: v_cmp_eq_u32_e64 s[14:15], 0, v31 2443; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 2444; SI-NEXT: s_waitcnt vmcnt(0) 2445; SI-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v31 2446; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:144 2447; SI-NEXT: s_waitcnt vmcnt(0) 2448; SI-NEXT: v_cmp_eq_u32_e64 s[10:11], 0, v31 2449; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 2450; SI-NEXT: s_waitcnt vmcnt(0) 2451; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v31 2452; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 2453; SI-NEXT: s_waitcnt vmcnt(0) 2454; SI-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v31 2455; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 2456; SI-NEXT: s_waitcnt vmcnt(0) 2457; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v31 2458; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 2459; SI-NEXT: s_waitcnt vmcnt(0) 2460; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2461; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 2462; SI-NEXT: s_waitcnt vmcnt(0) 2463; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2464; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2465; SI-NEXT: v_cndmask_b32_e64 v0, v31, v0, s[16:17] 2466; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 2467; SI-NEXT: s_waitcnt vmcnt(0) 2468; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2469; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2470; SI-NEXT: v_cndmask_b32_e64 v1, v31, v1, s[14:15] 2471; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 2472; SI-NEXT: s_waitcnt vmcnt(0) 2473; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2474; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2475; SI-NEXT: v_cndmask_b32_e64 v2, v31, v2, s[12:13] 2476; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 2477; SI-NEXT: s_waitcnt vmcnt(0) 2478; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2479; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2480; SI-NEXT: v_cndmask_b32_e64 v3, v31, v3, s[10:11] 2481; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 2482; SI-NEXT: s_waitcnt vmcnt(0) 2483; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2484; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2485; SI-NEXT: v_cndmask_b32_e64 v4, v31, v4, s[8:9] 2486; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 2487; SI-NEXT: s_waitcnt vmcnt(0) 2488; SI-NEXT: v_cvt_f16_f32_e32 
v31, v31 2489; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2490; SI-NEXT: v_cndmask_b32_e64 v5, v31, v5, s[6:7] 2491; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 2492; SI-NEXT: s_waitcnt vmcnt(0) 2493; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2494; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2495; SI-NEXT: v_cndmask_b32_e64 v6, v31, v6, s[4:5] 2496; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 2497; SI-NEXT: s_waitcnt vmcnt(0) 2498; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2499; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2500; SI-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc 2501; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 2502; SI-NEXT: s_waitcnt vmcnt(0) 2503; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2504; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 2505; SI-NEXT: s_waitcnt vmcnt(0) 2506; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2507; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2508; SI-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc 2509; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 2510; SI-NEXT: s_waitcnt vmcnt(0) 2511; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2512; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 2513; SI-NEXT: s_waitcnt vmcnt(0) 2514; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2515; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2516; SI-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc 2517; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 2518; SI-NEXT: s_waitcnt vmcnt(0) 2519; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2520; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 2521; SI-NEXT: s_waitcnt vmcnt(0) 2522; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2523; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2524; SI-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc 2525; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 2526; SI-NEXT: s_waitcnt vmcnt(0) 2527; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2528; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 2529; SI-NEXT: s_waitcnt vmcnt(0) 2530; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2531; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2532; SI-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc 2533; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 2534; SI-NEXT: s_waitcnt vmcnt(0) 2535; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2536; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 2537; SI-NEXT: s_waitcnt vmcnt(0) 2538; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2539; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2540; SI-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc 2541; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 2542; SI-NEXT: s_waitcnt vmcnt(0) 2543; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2544; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 2545; SI-NEXT: s_waitcnt vmcnt(0) 2546; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2547; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2548; SI-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc 2549; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 2550; SI-NEXT: s_waitcnt vmcnt(0) 2551; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2552; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 2553; SI-NEXT: s_waitcnt vmcnt(0) 2554; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2555; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2556; SI-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc 2557; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 2558; SI-NEXT: s_waitcnt vmcnt(0) 2559; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2560; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 2561; SI-NEXT: s_waitcnt vmcnt(0) 2562; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2563; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2564; SI-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc 2565; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 2566; SI-NEXT: s_waitcnt vmcnt(0) 2567; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2568; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 2569; SI-NEXT: s_waitcnt vmcnt(0) 2570; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2571; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2572; SI-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc 2573; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:200 2574; SI-NEXT: s_waitcnt vmcnt(0) 2575; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2576; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 2577; SI-NEXT: s_waitcnt vmcnt(0) 2578; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2579; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2580; SI-NEXT: v_cndmask_b32_e32 v17, v31, v17, vcc 2581; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 2582; SI-NEXT: s_waitcnt vmcnt(0) 2583; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2584; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 2585; SI-NEXT: s_waitcnt vmcnt(0) 2586; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2587; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2588; SI-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc 2589; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 2590; SI-NEXT: s_waitcnt vmcnt(0) 2591; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2592; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 2593; SI-NEXT: s_waitcnt vmcnt(0) 2594; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2595; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2596; SI-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc 2597; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 2598; SI-NEXT: s_waitcnt vmcnt(0) 2599; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2600; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 2601; SI-NEXT: s_waitcnt vmcnt(0) 2602; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2603; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2604; SI-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc 2605; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 2606; SI-NEXT: s_waitcnt vmcnt(0) 2607; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2608; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 2609; SI-NEXT: s_waitcnt vmcnt(0) 2610; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2611; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2612; SI-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc 2613; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 2614; SI-NEXT: s_waitcnt vmcnt(0) 2615; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v31 2616; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 2617; SI-NEXT: s_waitcnt vmcnt(0) 2618; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2619; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2620; SI-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc 2621; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 2622; SI-NEXT: s_waitcnt vmcnt(0) 2623; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2624; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 2625; SI-NEXT: s_waitcnt vmcnt(0) 2626; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2627; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2628; SI-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc 2629; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 2630; SI-NEXT: s_waitcnt vmcnt(0) 2631; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2632; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 2633; SI-NEXT: s_waitcnt vmcnt(0) 2634; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2635; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2636; SI-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc 2637; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 2638; SI-NEXT: s_waitcnt vmcnt(0) 2639; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2640; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 2641; SI-NEXT: s_waitcnt vmcnt(0) 2642; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2643; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2644; SI-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc 2645; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 2646; SI-NEXT: s_waitcnt vmcnt(0) 2647; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2648; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 2649; SI-NEXT: s_waitcnt vmcnt(0) 2650; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2651; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2652; SI-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc 2653; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 2654; SI-NEXT: s_waitcnt vmcnt(0) 2655; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2656; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 2657; 
SI-NEXT: s_waitcnt vmcnt(0) 2658; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2659; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2660; SI-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc 2661; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 2662; SI-NEXT: s_waitcnt vmcnt(0) 2663; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2664; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 2665; SI-NEXT: s_waitcnt vmcnt(0) 2666; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2667; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2668; SI-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc 2669; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 2670; SI-NEXT: s_waitcnt vmcnt(0) 2671; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2672; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 2673; SI-NEXT: s_waitcnt vmcnt(0) 2674; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2675; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2676; SI-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc 2677; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 2678; SI-NEXT: s_waitcnt vmcnt(0) 2679; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2680; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 2681; SI-NEXT: s_waitcnt vmcnt(0) 2682; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2683; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2684; SI-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc 2685; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 2686; SI-NEXT: s_waitcnt vmcnt(0) 2687; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2688; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 2689; SI-NEXT: s_waitcnt vmcnt(0) 2690; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 2691; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 2692; SI-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc 2693; SI-NEXT: s_setpc_b64 s[30:31] 2694; 2695; VI-LABEL: v_vselect_v32f16: 2696; VI: ; %bb.0: 2697; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2698; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill 2699; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded 
Spill 2700; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill 2701; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill 2702; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill 2703; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill 2704; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill 2705; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill 2706; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill 2707; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill 2708; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill 2709; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill 2710; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill 2711; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:120 2712; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 2713; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 2714; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 2715; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 2716; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 2717; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 2718; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 2719; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:128 2720; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 2721; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 2722; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 2723; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 2724; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 2725; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 2726; VI-NEXT: buffer_load_dword v48, off, 
s[0:3], s32 offset:16 2727; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 2728; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 2729; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 2730; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 2731; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v30 2732; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v13 2733; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v29 2734; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 2735; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 2736; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 2737; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v27 2738; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v10 2739; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 2740; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 2741; VI-NEXT: s_waitcnt vmcnt(14) 2742; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 2743; VI-NEXT: v_cndmask_b32_e32 v36, v43, v38, vcc 2744; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v35 2745; VI-NEXT: v_cndmask_b32_e32 v35, v45, v44, vcc 2746; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 2747; VI-NEXT: v_cndmask_b32_e32 v34, v47, v46, vcc 2748; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v33 2749; VI-NEXT: v_cndmask_b32_e32 v33, v57, v56, vcc 2750; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 2751; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 2752; VI-NEXT: v_cndmask_b32_e32 v32, v59, v58, vcc 2753; VI-NEXT: s_waitcnt vmcnt(13) 2754; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 2755; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 2756; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v24 2757; VI-NEXT: v_cndmask_b32_e32 v38, v38, v60, vcc 2758; VI-NEXT: s_waitcnt vmcnt(12) 2759; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v39 2760; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 2761; VI-NEXT: v_cndmask_b32_e32 v39, v44, v43, vcc 2762; VI-NEXT: s_waitcnt vmcnt(11) 2763; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 2764; VI-NEXT: s_waitcnt vmcnt(10) 2765; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 2766; VI-NEXT: v_cndmask_b32_e32 v31, v31, v45, vcc 2767; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v7 2768; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23 2769; 
VI-NEXT: s_waitcnt vmcnt(9) 2770; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v50 2771; VI-NEXT: v_cndmask_b32_e32 v50, v43, v55, vcc 2772; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 2773; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22 2774; VI-NEXT: s_waitcnt vmcnt(8) 2775; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v53 2776; VI-NEXT: v_cndmask_b32_e32 v53, v43, v55, vcc 2777; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v5 2778; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21 2779; VI-NEXT: s_waitcnt vmcnt(7) 2780; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v54 2781; VI-NEXT: v_cndmask_b32_e32 v54, v43, v55, vcc 2782; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 2783; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 2784; VI-NEXT: s_waitcnt vmcnt(6) 2785; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v52 2786; VI-NEXT: v_cndmask_b32_e32 v52, v43, v55, vcc 2787; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3 2788; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 2789; VI-NEXT: s_waitcnt vmcnt(5) 2790; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v51 2791; VI-NEXT: v_cndmask_b32_e32 v51, v43, v55, vcc 2792; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 2793; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 2794; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:108 2795; VI-NEXT: s_waitcnt vmcnt(5) 2796; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v49 2797; VI-NEXT: v_cndmask_b32_e32 v49, v43, v55, vcc 2798; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 2799; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 2800; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 2801; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 2802; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 2803; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 2804; VI-NEXT: s_waitcnt vmcnt(8) 2805; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v48 2806; VI-NEXT: v_cndmask_b32_e32 v48, v46, v43, vcc 2807; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 2808; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 2809; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 2810; VI-NEXT: 
v_lshrrev_b32_e32 v46, 16, v0 2811; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 2812; VI-NEXT: s_waitcnt vmcnt(10) 2813; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 2814; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 2815; VI-NEXT: v_cndmask_b32_e32 v46, v58, v46, vcc 2816; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 2817; VI-NEXT: s_waitcnt vmcnt(11) 2818; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 2819; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 2820; VI-NEXT: v_cndmask_b32_e32 v15, v37, v15, vcc 2821; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 2822; VI-NEXT: s_waitcnt vmcnt(12) 2823; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 2824; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 2825; VI-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc 2826; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4 2827; VI-NEXT: s_waitcnt vmcnt(13) 2828; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v44 2829; VI-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc 2830; VI-NEXT: s_waitcnt vmcnt(12) 2831; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v55 2832; VI-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc 2833; VI-NEXT: s_waitcnt vmcnt(11) 2834; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v45 2835; VI-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc 2836; VI-NEXT: s_waitcnt vmcnt(10) 2837; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v47 2838; VI-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc 2839; VI-NEXT: s_waitcnt vmcnt(9) 2840; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v56 2841; VI-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc 2842; VI-NEXT: s_waitcnt vmcnt(8) 2843; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v43 2844; VI-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc 2845; VI-NEXT: s_waitcnt vmcnt(7) 2846; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v57 2847; VI-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc 2848; VI-NEXT: s_waitcnt vmcnt(6) 2849; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v59 2850; VI-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc 2851; VI-NEXT: s_waitcnt vmcnt(5) 2852; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v40 2853; VI-NEXT: 
v_cndmask_b32_e32 v5, v21, v5, vcc 2854; VI-NEXT: s_waitcnt vmcnt(4) 2855; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v58 2856; VI-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc 2857; VI-NEXT: s_waitcnt vmcnt(3) 2858; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v41 2859; VI-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc 2860; VI-NEXT: s_waitcnt vmcnt(2) 2861; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v37 2862; VI-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc 2863; VI-NEXT: s_waitcnt vmcnt(1) 2864; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v42 2865; VI-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc 2866; VI-NEXT: s_waitcnt vmcnt(0) 2867; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 2868; VI-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc 2869; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v46 2870; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload 2871; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload 2872; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload 2873; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload 2874; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload 2875; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload 2876; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload 2877; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload 2878; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload 2879; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload 2880; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload 2881; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload 2882; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload 2883; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2884; 
VI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 2885; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2886; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v49 2887; VI-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2888; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 2889; VI-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2890; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 2891; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2892; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 2893; VI-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2894; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 2895; VI-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2896; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 2897; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2898; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v39 2899; VI-NEXT: v_or_b32_sdwa v8, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2900; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 2901; VI-NEXT: v_or_b32_sdwa v9, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2902; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v32 2903; VI-NEXT: v_or_b32_sdwa v10, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2904; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v33 2905; VI-NEXT: v_or_b32_sdwa v11, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2906; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v34 2907; VI-NEXT: v_or_b32_sdwa v12, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2908; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 2909; VI-NEXT: v_or_b32_sdwa v13, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD 2910; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v36 2911; VI-NEXT: v_or_b32_sdwa v14, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2912; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v31 2913; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2914; VI-NEXT: s_waitcnt vmcnt(0) 2915; VI-NEXT: s_setpc_b64 s[30:31] 2916; 2917; GFX11-LABEL: v_vselect_v32f16: 2918; GFX11: ; %bb.0: 2919; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2920; GFX11-NEXT: s_clause 0x1f 2921; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120 2922; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112 2923; GFX11-NEXT: scratch_load_b32 v33, off, s32 2924; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:104 2925; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:96 2926; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:88 2927; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:80 2928; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 2929; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 2930; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:56 2931; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48 2932; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:40 2933; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 2934; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:24 2935; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:16 2936; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:8 2937; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:124 2938; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:116 2939; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:108 2940; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:100 2941; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:92 2942; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84 2943; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:76 2944; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:68 2945; GFX11-NEXT: scratch_load_b32 
v71, off, s32 offset:60 2946; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52 2947; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:44 2948; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:36 2949; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:28 2950; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12 2951; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:4 2952; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:20 2953; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 2954; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 2955; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v30 2956; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v13 2957; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v29 2958; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v12 2959; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v28 2960; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v11 2961; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v27 2962; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v10 2963; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v26 2964; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v9 2965; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v25 2966; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v8 2967; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v24 2968; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v7 2969; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v23 2970; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v6 2971; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v22 2972; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v5 2973; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v21 2974; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v4 2975; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v20 2976; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v3 2977; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v19 2978; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v2 2979; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v18 2980; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v1 2981; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15 2982; GFX11-NEXT: s_waitcnt vmcnt(32) 2983; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 2984; GFX11-NEXT: 
v_lshrrev_b32_e32 v31, 16, v17 2985; GFX11-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo 2986; GFX11-NEXT: s_waitcnt vmcnt(31) 2987; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32 2988; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v0 2989; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 2990; GFX11-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo 2991; GFX11-NEXT: s_waitcnt vmcnt(29) 2992; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34 2993; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v33 2994; GFX11-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo 2995; GFX11-NEXT: s_waitcnt vmcnt(28) 2996; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35 2997; GFX11-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo 2998; GFX11-NEXT: s_waitcnt vmcnt(27) 2999; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36 3000; GFX11-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo 3001; GFX11-NEXT: s_waitcnt vmcnt(26) 3002; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37 3003; GFX11-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo 3004; GFX11-NEXT: s_waitcnt vmcnt(25) 3005; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38 3006; GFX11-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo 3007; GFX11-NEXT: s_waitcnt vmcnt(24) 3008; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39 3009; GFX11-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo 3010; GFX11-NEXT: s_waitcnt vmcnt(23) 3011; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48 3012; GFX11-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo 3013; GFX11-NEXT: s_waitcnt vmcnt(22) 3014; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49 3015; GFX11-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo 3016; GFX11-NEXT: s_waitcnt vmcnt(21) 3017; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50 3018; GFX11-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo 3019; GFX11-NEXT: s_waitcnt vmcnt(20) 3020; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51 3021; GFX11-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo 3022; GFX11-NEXT: s_waitcnt vmcnt(19) 3023; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52 3024; GFX11-NEXT: v_cndmask_b32_e32 v52, v146, 
v145, vcc_lo 3025; GFX11-NEXT: s_waitcnt vmcnt(18) 3026; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53 3027; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo 3028; GFX11-NEXT: s_waitcnt vmcnt(17) 3029; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 3030; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo 3031; GFX11-NEXT: s_waitcnt vmcnt(16) 3032; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 3033; GFX11-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo 3034; GFX11-NEXT: s_waitcnt vmcnt(15) 3035; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 3036; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo 3037; GFX11-NEXT: s_waitcnt vmcnt(14) 3038; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 3039; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 3040; GFX11-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 3041; GFX11-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo 3042; GFX11-NEXT: s_waitcnt vmcnt(13) 3043; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 3044; GFX11-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo 3045; GFX11-NEXT: s_waitcnt vmcnt(12) 3046; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 3047; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 3048; GFX11-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 3049; GFX11-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo 3050; GFX11-NEXT: s_waitcnt vmcnt(11) 3051; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 3052; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo 3053; GFX11-NEXT: s_waitcnt vmcnt(10) 3054; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 3055; GFX11-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 3056; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 3057; GFX11-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 3058; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo 3059; GFX11-NEXT: s_waitcnt vmcnt(9) 3060; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 3061; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo 3062; GFX11-NEXT: s_waitcnt vmcnt(8) 3063; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 3064; GFX11-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 3065; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) 3066; GFX11-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 3067; GFX11-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo 3068; GFX11-NEXT: s_waitcnt vmcnt(7) 3069; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 3070; GFX11-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo 3071; GFX11-NEXT: s_waitcnt vmcnt(6) 3072; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 3073; GFX11-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 3074; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 3075; GFX11-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 3076; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo 3077; GFX11-NEXT: s_waitcnt vmcnt(5) 3078; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82 3079; GFX11-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo 3080; GFX11-NEXT: s_waitcnt vmcnt(4) 3081; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 3082; GFX11-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 3083; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 3084; GFX11-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 3085; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo 3086; GFX11-NEXT: s_waitcnt vmcnt(3) 3087; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 3088; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo 3089; GFX11-NEXT: s_waitcnt vmcnt(2) 3090; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 3091; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo 3092; GFX11-NEXT: s_waitcnt vmcnt(1) 3093; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 3094; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 3095; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) 3096; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 3097; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo 3098; GFX11-NEXT: s_waitcnt vmcnt(0) 3099; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87 3100; GFX11-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 3101; GFX11-NEXT: v_perm_b32 v2, v52, v2, 0x5040100 3102; GFX11-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo 3103; GFX11-NEXT: v_perm_b32 v1, v31, v1, 0x5040100 3104; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 
3105; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 3106; GFX11-NEXT: s_setpc_b64 s[30:31] 3107 %cmp = icmp eq <32 x i32> %cond, zeroinitializer 3108 %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b 3109 ret <32 x half> %select 3110} 3111