; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s

; Codegen checks for selects on i64 values, run once for tahiti (SI prefix)
; and once for tonga (VI prefix); the GCN prefix is shared by both runs.

; Kernel-argument i64 selected against the constant 0 and stored whole.
; GCN-LABEL: {{^}}select0:
; i64 select should be split into two i32 selects, and we shouldn't need
; to use a shift to extract the hi dword of the input.
; GCN-NOT: s_lshr_b64
; GCN: v_cndmask
; GCN: v_cndmask
define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
entry:
  %0 = icmp ugt i32 %cond, 5
  %1 = select i1 %0, i64 0, i64 %in
  store i64 %1, i64 addrspace(1)* %out
  ret void
}

; The i64 select result is truncated to i32 before the store, so exactly one
; 32-bit select is expected on each target (the -NOT lines reject a second).
; GCN-LABEL: {{^}}select_trunc_i64:
; VI: s_cselect_b32
; VI-NOT: s_cselect_b32
; SI: v_cndmask_b32
; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
  %cmp = icmp ugt i32 %cond, 5
  %sel = select i1 %cmp, i64 0, i64 %in
  %trunc = trunc i64 %sel to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as select_trunc_i64, but both select operands are i64 kernel arguments
; rather than one of them being the constant 0.
; GCN-LABEL: {{^}}select_trunc_i64_2:
; VI: s_cselect_b32
; VI-NOT: s_cselect_b32
; SI: v_cndmask_b32
; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
  %cmp = icmp ugt i32 %cond, 5
  %sel = select i1 %cmp, i64 %a, i64 %b
  %trunc = trunc i64 %sel to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as select_trunc_i64_2, but the two i64 operands are loaded through
; addrspace(1) pointers instead of being passed directly as arguments.
; GCN-LABEL: {{^}}v_select_trunc_i64_2:
; VI: s_cselect_b32
; VI-NOT: s_cselect_b32
; SI: v_cndmask_b32
; SI-NOT: v_cndmask_b32
define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %cmp = icmp ugt i32 %cond, 5
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %sel = select i1 %cmp, i64 %a, i64 %b
  %trunc = trunc i64 %sel to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}

; Select between a loaded i64 and the constant 63 << 32. The constant's low
; dword is 0 and its high dword is 63, so each 32-bit half of the split select
; is expected to take its constant operand as an immediate (0 and 63).
; GCN-LABEL: {{^}}v_select_i64_split_imm:
; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %cmp = icmp ugt i32 %cond, 5
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
  store i64 %sel, i64 addrspace(1)* %out, align 8
  ret void
}