; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefixes=SI %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=VI %s
;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefixes=EG %s

; Codegen tests for the vector form of the 'select' instruction on 2- and
; 4-element i32 and float vectors. Every kernel loads two vectors from %in0
; and %in1, compares them lane by lane, selects per lane, and stores the
; result vector to %out. The expected output is checked for three llc
; configurations: default amdgcn, amdgcn with -mcpu=tonga, and r600 with
; -mcpu=redwood.

; <2 x i32>: a lane takes the kernel argument %val when %load0 is signed
; greater than %load1, otherwise it keeps %load0. The amdgcn checks expect one
; s_cmp_gt_i32 + s_cselect_b32 pair per lane; the r600 checks expect
; SETGT_INT + CNDE_INT per lane.
define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
; SI-LABEL: test_select_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_gt_i32 s9, s5
; SI-NEXT:    s_cselect_b32 s5, s7, s9
; SI-NEXT:    s_cmp_gt_i32 s8, s4
; SI-NEXT:    s_cselect_b32 s4, s6, s8
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_select_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_gt_i32 s9, s5
; VI-NEXT:    s_cselect_b32 s5, s7, s9
; VI-NEXT:    s_cmp_gt_i32 s8, s4
; VI-NEXT:    s_cselect_b32 s4, s6, s8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: test_select_v2i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T0.X, KC0[2].Z,
; EG-NEXT:     MOV * T1.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     SETGT_INT * T0.W, T0.Y, T1.Y,
; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
; EG-NEXT:     SETGT_INT * T0.W, T0.X, T1.X,
; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %load0 = load <2 x i32>, ptr addrspace(1) %in0
  %load1 = load <2 x i32>, ptr addrspace(1) %in1
  %cmp = icmp sgt <2 x i32> %load0, %load1
  %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

; <2 x float>: a lane takes %load0 when the two loaded lanes compare
; unordered-or-not-equal (fcmp une), otherwise %load1. The amdgcn checks
; expect v_cmp_neq_f32 + v_cndmask_b32 per lane; the r600 checks expect
; SETNE_DX10 + CNDE_INT per lane.
define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: test_select_v2f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s7
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s7, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s6, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_select_v2f32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: test_select_v2f32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T0.X, KC0[2].Z,
; EG-NEXT:     MOV * T1.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     SETNE_DX10 * T0.W, T0.Y, T1.Y,
; EG-NEXT:     CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
; EG-NEXT:     SETNE_DX10 * T0.W, T0.X, T1.X,
; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.X, T0.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %load0 = load <2 x float>, ptr addrspace(1) %in0
  %load1 = load <2 x float>, ptr addrspace(1) %in1
  %cmp = fcmp une <2 x float> %load0, %load1
  %result = select <2 x i1> %cmp, <2 x float> %load0, <2 x float> %load1
  store <2 x float> %result, ptr addrspace(1) %out
  ret void
}

; <4 x i32>: same pattern as the <2 x i32> test, widened to four lanes. A lane
; takes the kernel argument %val when %load0 is signed greater than %load1,
; otherwise it keeps %load0.
define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) {
; SI-LABEL: test_select_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
; SI-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x0
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_gt_i32 s10, s14
; SI-NEXT:    s_cselect_b32 s6, s6, s10
; SI-NEXT:    s_cmp_gt_i32 s9, s13
; SI-NEXT:    s_cselect_b32 s5, s5, s9
; SI-NEXT:    s_cmp_gt_i32 s11, s15
; SI-NEXT:    s_cselect_b32 s7, s7, s11
; SI-NEXT:    s_cmp_gt_i32 s8, s12
; SI-NEXT:    s_cselect_b32 s4, s4, s8
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_select_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
; VI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x44
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_gt_i32 s10, s14
; VI-NEXT:    s_cselect_b32 s0, s18, s10
; VI-NEXT:    s_cmp_gt_i32 s9, s13
; VI-NEXT:    s_cselect_b32 s1, s17, s9
; VI-NEXT:    s_cmp_gt_i32 s11, s15
; VI-NEXT:    s_cselect_b32 s2, s19, s11
; VI-NEXT:    s_cmp_gt_i32 s8, s12
; VI-NEXT:    s_cselect_b32 s3, s16, s8
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: test_select_v4i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 9, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T0.X, KC0[2].Z,
; EG-NEXT:     MOV * T1.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     SETGT_INT T1.W, T0.W, T1.W,
; EG-NEXT:     SETGT_INT * T2.W, T0.Z, T1.Z,
; EG-NEXT:     CNDE_INT * T0.W, PV.W, T0.W, KC0[4].X,
; EG-NEXT:     CNDE_INT T0.Z, T2.W, T0.Z, KC0[3].W,
; EG-NEXT:     SETGT_INT * T1.W, T0.Y, T1.Y,
; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
; EG-NEXT:     SETGT_INT * T1.W, T0.X, T1.X,
; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %load0 = load <4 x i32>, ptr addrspace(1) %in0
  %load1 = load <4 x i32>, ptr addrspace(1) %in1
  %cmp = icmp sgt <4 x i32> %load0, %load1
  %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; <4 x float>: same pattern as the <2 x float> test, widened to four lanes. A
; lane takes %load0 when fcmp une holds for that lane, otherwise %load1.
define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: test_select_v4f32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_mov_b32_e32 v3, s11
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s7, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s6
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s6, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s5, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s4, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_select_v4f32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s3, v3
; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s2, v2
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s1, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: test_select_v4f32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 9, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T1.X, 0, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV T0.X, KC0[2].Z,
; EG-NEXT:     MOV * T1.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     SETNE_DX10 T2.W, T0.W, T1.W,
; EG-NEXT:     SETNE_DX10 * T3.W, T0.Z, T1.Z,
; EG-NEXT:     CNDE_INT * T0.W, PV.W, T1.W, T0.W,
; EG-NEXT:     CNDE_INT T0.Z, T3.W, T1.Z, T0.Z,
; EG-NEXT:     SETNE_DX10 * T1.W, T0.Y, T1.Y,
; EG-NEXT:     CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
; EG-NEXT:     SETNE_DX10 * T1.W, T0.X, T1.X,
; EG-NEXT:     CNDE_INT T0.X, PV.W, T1.X, T0.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %load0 = load <4 x float>, ptr addrspace(1) %in0
  %load1 = load <4 x float>, ptr addrspace(1) %in1
  %cmp = fcmp une <4 x float> %load0, %load1
  %result = select <4 x i1> %cmp, <4 x float> %load0, <4 x float> %load1
  store <4 x float> %result, ptr addrspace(1) %out
  ret void
}