; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.

; Check prefixes used below, as set up by the three llc invocations above
;   GCN    shared by all three targets
;   SI     tahiti only
;   VI     tonga only
;   GFX9   gfx900 only
;   GFX89  tonga and gfx900

; <2 x i8> operands loaded from memory, selected on (%c == 0).
; tahiti and gfx900 must use exactly one cndmask.
; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: s_cselect_b64
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
  %b = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
  store <2 x i8> %select, ptr addrspace(1) %out, align 2
  ret void
}

; <4 x i8> operands loaded from memory; the four bytes fill one dword, so
; exactly one cndmask is expected.
; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <4 x i8>, ptr addrspace(1) %a.ptr
  %b = load <4 x i8>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <8 x i8> operands loaded from memory; exactly two cndmask are expected.
; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <8 x i8>, ptr addrspace(1) %a.ptr
  %b = load <8 x i8>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
  store <8 x i8> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <16 x i8> operands loaded from memory; exactly four cndmask are expected.
; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <16 x i8>, ptr addrspace(1) %a.ptr
  %b = load <16 x i8>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
  store <16 x i8> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <4 x i8> operands passed as kernel arguments, selected on an i8 condition.
; A single scalar select is expected; tahiti must not emit any cndmask after it.
; GCN-LABEL: {{^}}select_v4i8:
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: s_cselect_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v4i8(ptr addrspace(1) %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %cmp = icmp eq i8 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <2 x i16> operands passed as kernel arguments. A single scalar select is
; expected. The negative pattern for tahiti is the plain substring "cndmask"
; (same as select_v4i8) so that any VALU select spelling is rejected.
; GCN-LABEL: {{^}}select_v2i16:
; GFX89: s_load_dwordx4
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: s_cselect_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <2 x i16> operands loaded from memory with two dword vector loads; exactly
; one cndmask is expected.
; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <2 x i16>, ptr addrspace(1) %a.ptr
  %b = load <2 x i16>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <3 x i16> operands loaded from memory. tahiti must use exactly two cndmask;
; tonga is checked for the s_cselect_b64 that materializes the condition;
; gfx900 is checked for at least two cndmask.
; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; VI: s_cselect_b64
; GFX9: cndmask
; GFX9: cndmask
define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, ptr addrspace(1) %a.ptr
  %b = load <3 x i16>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <4 x i16> operands loaded from memory; exactly two cndmask are expected.
; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <4 x i16>, ptr addrspace(1) %a.ptr
  %b = load <4 x i16>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <8 x i16> operands loaded from memory; exactly four cndmask are expected.
; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <8 x i16>, ptr addrspace(1) %a.ptr
  %b = load <8 x i16>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <16 x i16> operands loaded from memory; exactly eight cndmask are expected.
; GCN-LABEL: {{^}}v_select_v16i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <16 x i16>, ptr addrspace(1) %a.ptr
  %b = load <16 x i16>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i16> %a, <16 x i16> %b
  store <16 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <32 x i16> operands loaded from memory; exactly sixteen cndmask are expected.
; GCN-LABEL: {{^}}v_select_v32i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <32 x i16>, ptr addrspace(1) %a.ptr
  %b = load <32 x i16>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <32 x i16> %a, <32 x i16> %b
  store <32 x i16> %select, ptr addrspace(1) %out, align 4
  ret void
}

; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.

; <2 x i32> operands passed as kernel arguments; one scalar select per
; 32-bit element, then a single 64-bit store.
; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %select, ptr addrspace(1) %out, align 8
  ret void
}

; <4 x i32> operands passed as kernel arguments; four scalar selects, then a
; single 128-bit store.
; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %select, ptr addrspace(1) %out, align 16
  ret void
}

; Loaded <4 x i32> selected against zeroinitializer on (%cond < 32, unsigned).
; The condition is expected to be materialized into vcc and consumed by four
; cndmask instructions that take the constant 0 as one source.
; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x i32>, ptr addrspace(1) %in
  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
  store <4 x i32> %tmp3, ptr addrspace(1) %out, align 16
  ret void
}

; <8 x i32> operands passed as kernel arguments; eight scalar selects.
; GCN-LABEL: {{^}}select_v8i32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <2 x float> operands passed as kernel arguments. The compare and the two
; scalar selects may appear in any order relative to each other.
; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
; GCN-DAG: s_cselect_b32
; GCN-DAG: s_cselect_b32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  store <2 x float> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <3 x float> operands passed as kernel arguments; three scalar selects. The
; store pattern is left open-ended so any buffer_store_dwordx* width matches.
; GCN-LABEL: {{^}}s_select_v3f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v3f32(ptr addrspace(1) %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x float> %a, <3 x float> %b
  store <3 x float> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <4 x float> operands passed as kernel arguments; both vectors are expected
; to arrive through one s_load_dwordx8, followed by four scalar selects.
; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx8
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(ptr addrspace(1) %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
  store <4 x float> %select, ptr addrspace(1) %out, align 16
  ret void
}

; Loaded <4 x float> selected against zeroinitializer on (%cond < 32,
; unsigned). Same expected shape as v_select_v4i32.
; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
; GCN: s_cselect_b64 vcc, -1, 0
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x float>, ptr addrspace(1) %in
  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
  store <4 x float> %tmp3, ptr addrspace(1) %out, align 16
  ret void
}

; <5 x float> operands passed as kernel arguments; five scalar selects.
; GCN-LABEL: {{^}}s_select_v5f32:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}

; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v5f32(ptr addrspace(1) %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
  store <5 x float> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <8 x float> operands passed as kernel arguments; eight scalar selects, the
; same expectation as select_v8i32 and the other kernel-argument float tests
; in this file.
; NOTE(review): expectation aligned with the sibling tests; confirm against
; llc output for all three targets.
; GCN-LABEL: {{^}}select_v8f32:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v8f32(ptr addrspace(1) %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
  store <8 x float> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <2 x double> operands passed as kernel arguments; four 32-bit scalar
; selects (two per double).
; GCN-LABEL: {{^}}select_v2f64:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v2f64(ptr addrspace(1) %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
  store <2 x double> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <4 x double> operands passed as kernel arguments; eight 32-bit scalar
; selects.
; GCN-LABEL: {{^}}select_v4f64:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v4f64(ptr addrspace(1) %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
  store <4 x double> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <8 x double> operands passed as kernel arguments; sixteen 32-bit scalar
; selects.
; GCN-LABEL: {{^}}select_v8f64:
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
define amdgpu_kernel void @select_v8f64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
  store <8 x double> %select, ptr addrspace(1) %out, align 16
  ret void
}

; <2 x half> operands loaded from memory; exactly one cndmask is expected.
; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <2 x half>, ptr addrspace(1) %a.ptr
  %b = load <2 x half>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
  store <2 x half> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <3 x half> operands loaded from memory; exactly two cndmask are expected.
; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <3 x half>, ptr addrspace(1) %a.ptr
  %b = load <3 x half>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
  store <3 x half> %select, ptr addrspace(1) %out, align 4
  ret void
}

; <4 x half> operands loaded from memory; exactly two cndmask are expected.
; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
  %a = load <4 x half>, ptr addrspace(1) %a.ptr
  %b = load <4 x half>, ptr addrspace(1) %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  store <4 x half> %select, ptr addrspace(1) %out, align 4
  ret void
}

; NOTE(review): no function in this file calls the intrinsic declared below;
; it looks like a leftover declaration - confirm before removing.
; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }