; RUN: llc -amdgpu-scalar-ir-passes=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s

; A select with an undef operand should fold to the other operand, so
; no compare/select instructions are emitted.

; GCN-LABEL: {{^}}select_undef_lhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_lhs(float %val, i1 %cond) {
  %sel = select i1 %cond, float undef, float %val
  ret float %sel
}

; GCN-LABEL: {{^}}select_undef_rhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_rhs(float %val, i1 %cond) {
  %sel = select i1 %cond, float %val, float undef
  ret float %sel
}

; GCN-LABEL: {{^}}select_undef_n1:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n1(ptr addrspace(1) %a, i32 %c) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float 1.000000e+00, float undef
  store float %sel, ptr addrspace(1) %a
  ret void
}

; GCN-LABEL: {{^}}select_undef_n2:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n2(ptr addrspace(1) %a, i32 %c) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float undef, float 1.000000e+00
  store float %sel, ptr addrspace(1) %a
  ret void
}

declare float @llvm.amdgcn.rcp.f32(float)


; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v6f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x float>, ptr addrspace(3) undef
  %add = fadd <6 x float> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x float> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v6i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x i32>, ptr addrspace(3) undef
  %add = add <6 x i32> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x i32> %add, ptr addrspace(3) undef
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v5f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <5 x float>, ptr addrspace(3) undef
  %add = fadd <5 x float> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <5 x float> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v5i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <5 x i32>, ptr addrspace(3) undef
  %add = add <5 x i32> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <5 x i32> %add, ptr addrspace(3) undef
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v3f64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x double>, ptr addrspace(3) %ptr
  %add = fadd <3 x double> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x double> %add, ptr addrspace(3) %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v3i64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x i64>, ptr addrspace(3) %ptr
  %add = add <3 x i64> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x i64> %add, ptr addrspace(3) %ptr
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v4f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x half>, ptr addrspace(3) %ptr
  %add = fadd <4 x half> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x half> %add, ptr addrspace(3) %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v4i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x i16>, ptr addrspace(3) %ptr
  %add = add <4 x i16> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x i16> %add, ptr addrspace(3) %ptr
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v2f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x half>, ptr addrspace(3) %ptr
  %add = fadd <2 x half> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x half> %add, ptr addrspace(3) %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v2i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x i16>, ptr addrspace(3) %ptr
  %add = add <2 x i16> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x i16> %add, ptr addrspace(3) %ptr
  ret void
}

; We were expanding undef vectors into zero vectors. Optimizations
; would then see we used no elements of the vector, and reform the
; undef vector resulting in a combiner loop.
; GCN-LABEL: {{^}}inf_loop_undef_vector:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_u64_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_add3_u32
; GCN-NEXT: global_store_dwordx2
define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
  %i = insertelement <6 x float> %arg, float %arg1, i64 2
  %i3 = bitcast <6 x float> %i to <3 x i64>
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = mul i64 %i5, %arg2
  %i7 = add i64 %i6, %i4
  store volatile i64 %i7, ptr addrspace(1) undef, align 4
  ret void
}

; GCN-LABEL: {{^}}undef_bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi bfloat [ undef, %entry ], [ %add, %loop ]
  %load = load volatile bfloat, ptr addrspace(3) undef
  %bc.0 = bitcast bfloat %load to i16
  %bc.1 = bitcast bfloat %phi to i16
  %add.i = add i16 %bc.0, %bc.1
  %add = bitcast i16 %add.i to bfloat
  br i1 %cond, label %loop, label %ret

ret:
  store volatile bfloat %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v2bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <2 x bfloat> %load to <2 x i16>
  %bc.1 = bitcast <2 x bfloat> %phi to <2 x i16>
  %add.i = add <2 x i16> %bc.0, %bc.1
  %add = bitcast <2 x i16> %add.i to <2 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v3bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <3 x bfloat> %load to <3 x i16>
  %bc.1 = bitcast <3 x bfloat> %phi to <3 x i16>
  %add.i = add <3 x i16> %bc.0, %bc.1
  %add = bitcast <3 x i16> %add.i to <3 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v4bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <4 x bfloat> %load to <4 x i16>
  %bc.1 = bitcast <4 x bfloat> %phi to <4 x i16>
  %add.i = add <4 x i16> %bc.0, %bc.1
  %add = bitcast <4 x i16> %add.i to <4 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v6bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <6 x bfloat> %load to <6 x i16>
  %bc.1 = bitcast <6 x bfloat> %phi to <6 x i16>
  %add.i = add <6 x i16> %bc.0, %bc.1
  %add = bitcast <6 x i16> %add.i to <6 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v8bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <8 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <8 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <8 x bfloat> %load to <8 x i16>
  %bc.1 = bitcast <8 x bfloat> %phi to <8 x i16>
  %add.i = add <8 x i16> %bc.0, %bc.1
  %add = bitcast <8 x i16> %add.i to <8 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <8 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v16bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <16 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <16 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <16 x bfloat> %load to <16 x i16>
  %bc.1 = bitcast <16 x bfloat> %phi to <16 x i16>
  %add.i = add <16 x i16> %bc.0, %bc.1
  %add = bitcast <16 x i16> %add.i to <16 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <16 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v32bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <32 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <32 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <32 x bfloat> %load to <32 x i16>
  %bc.1 = bitcast <32 x bfloat> %phi to <32 x i16>
  %add.i = add <32 x i16> %bc.0, %bc.1
  %add = bitcast <32 x i16> %add.i to <32 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <32 x bfloat> %add, ptr addrspace(3) undef
  ret void
}