; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s

; if the instruction is uniform and a SALU instruction is available, select the SALU instruction
define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: uniform_in_vgpr:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0
; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: uniform_in_vgpr:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0
; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v2
; NEW_RBS-NEXT: s_add_i32 s0, s0, s1
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %a.i32 = fptoui float %a to i32
  %res = add i32 %a.i32, %b
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; sgpr-to-vgpr copy + vgpr-to-sgpr readfirstlane combine from rb-legalize
define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: back_to_back_uniform_in_vgpr:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_add_f32_e64 v2, s0, s1
; OLD_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2
; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: back_to_back_uniform_in_vgpr:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_add_f32_e64 v2, s0, s1
; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2
; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v2
; NEW_RBS-NEXT: s_add_i32 s0, s0, s2
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %add = fadd float %a, %b
  %add.i32 = fptoui float %add to i32
  %res = add i32 %add.i32, %c
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; fast rules for vector instructions
define amdgpu_cs void @buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %voffset, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: buffer_load_uniform:
; OLD_RBS: ; %bb.0: ; %.entry
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s4
; OLD_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: buffer_load_uniform:
; NEW_RBS: ; %bb.0: ; %.entry
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s4
; NEW_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v3
; NEW_RBS-NEXT: s_add_i32 s0, s0, 1
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
.entry:
  %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
  %el1 = extractelement <4 x i32> %vec, i64 1
  %res = add i32 %el1, 1
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}
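
; An illustrative sketch (assumed MIR, not checked by FileCheck) of the shape
; the new RegBankSelect is expected to produce for a uniform use of a
; VALU-only result, as in the functions above; register names are made up:
;   %el1:vgpr(s32) = G_EXTRACT_VECTOR_ELT %load(<4 x s32>), %idx
;   %s:sgpr(s32) = G_AMDGPU_READANYLANE %el1   ; selected as v_readfirstlane_b32
;   %res:sgpr(s32) = G_ADD %s, %b              ; selected as s_add_i32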

define amdgpu_cs void @buffer_load_divergent(<4 x i32> inreg %rsrc, i32 %voffset, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: buffer_load_divergent:
; OLD_RBS: ; %bb.0: ; %.entry
; OLD_RBS-NEXT: buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_add_nc_u32_e32 v0, 1, v4
; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: buffer_load_divergent:
; NEW_RBS: ; %bb.0: ; %.entry
; NEW_RBS-NEXT: buffer_load_dwordx4 v[3:6], v0, s[0:3], 0 offen
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_add_nc_u32_e32 v0, 1, v4
; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT: s_endpgm
.entry:
  %vec = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
  %el1 = extractelement <4 x i32> %vec, i64 1
  %res = add i32 %el1, 1
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; lowering in rb-legalize (sgpr S64 is legal, vgpr has to be split into S32 pieces)
define amdgpu_ps void @vgpr_and_i64(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vgpr_and_i64:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_and_b32_e32 v0, v0, v2
; OLD_RBS-NEXT: v_and_b32_e32 v1, v1, v3
; OLD_RBS-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: vgpr_and_i64:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_and_b32_e32 v0, v0, v2
; NEW_RBS-NEXT: v_and_b32_e32 v1, v1, v3
; NEW_RBS-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; NEW_RBS-NEXT: s_endpgm
  %res = and i64 %a, %b
  store i64 %res, ptr addrspace(1) %ptr
  ret void
}

; It is up to the user instruction to deal with potentially truncated bits in the reg.
; Here G_ABS needs to sign-extend the S16 in the reg to S32 and then do an S32 G_ABS.
define amdgpu_ps void @abs_sgpr_i16(i16 inreg %arg, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: abs_sgpr_i16:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_sext_i32_i16 s0, s0
; OLD_RBS-NEXT: s_abs_i32 s0, s0
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_short v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: abs_sgpr_i16:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_sext_i32_i16 s0, s0
; NEW_RBS-NEXT: s_abs_i32 s0, s0
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_short v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
  store i16 %res, ptr addrspace(1) %ptr
  ret void
}
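
; A rough sketch (assumed MIR, not checked by FileCheck) of the abs_sgpr_i16
; lowering above: there is no 16-bit SALU abs, so the S16 payload held in a
; 32-bit sgpr is sign-extended in place first:
;   %ext:sgpr(s32) = G_SEXT_INREG %arg, 16   ; selected as s_sext_i32_i16
;   %abs:sgpr(s32) = G_ABS %ext              ; selected as s_abs_i32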

define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, i32 inreg %cond) {
; OLD_RBS-LABEL: uniform_i1_phi:
; OLD_RBS: ; %bb.0: ; %A
; OLD_RBS-NEXT: s_cmp_ge_u32 s0, 6
; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT: s_cmp_lg_u32 s1, 0
; OLD_RBS-NEXT: s_cbranch_scc1 .LBB6_2
; OLD_RBS-NEXT: ; %bb.1: ; %B
; OLD_RBS-NEXT: s_cmp_lt_u32 s0, 1
; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT: .LBB6_2: ; %exit
; OLD_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000
; OLD_RBS-NEXT: s_add_i32 s0, s0, 2
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: uniform_i1_phi:
; NEW_RBS: ; %bb.0: ; %A
; NEW_RBS-NEXT: s_cmp_ge_u32 s0, 6
; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT: s_cmp_lg_u32 s1, 0
; NEW_RBS-NEXT: s_cbranch_scc1 .LBB6_2
; NEW_RBS-NEXT: ; %bb.1: ; %B
; NEW_RBS-NEXT: s_cmp_lt_u32 s0, 1
; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT: .LBB6_2: ; %exit
; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0
; NEW_RBS-NEXT: s_cselect_b32 s0, -1, 0
; NEW_RBS-NEXT: s_add_i32 s0, s0, 2
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
A:
  %val_A = icmp uge i32 %tid, 6
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %B, label %exit

B:
  %val_B = icmp ult i32 %tid, 1
  br label %exit

exit:
  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
  %sel = select i1 %phi, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; this is effectively an i1 readfirstlane:
; a uniform i1 result on an instruction that is only available on the VALU
define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vcc_to_scc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s2
; OLD_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
; OLD_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: vcc_to_scc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0
; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0
; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT: s_and_b32 s0, s0, 1
; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0
; NEW_RBS-NEXT: s_cselect_b32 s0, s1, s2
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %vcc_to_scc = fcmp oeq float %a, 0.0
  %select = select i1 %vcc_to_scc, i32 %b, i32 %c
  store i32 %select, ptr addrspace(1) %ptr
  ret void
}

; the combiner in rb-legalize recognizes the sgpr S1 to vcc copy
define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: scc_to_vcc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 0
; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT: s_and_b32 s0, 1, s0
; OLD_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; OLD_RBS-NEXT: global_store_dword v[2:3], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: scc_to_vcc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 0
; NEW_RBS-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0
; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off
; NEW_RBS-NEXT: s_endpgm
  %scc_to_vcc = icmp eq i32 %a, 0
  %select = select i1 %scc_to_vcc, i32 %b, i32 %c
  store i32 %select, ptr addrspace(1) %ptr
  ret void
}

; this is the only G_TRUNC that is not a no-op in GlobalISel for AMDGPU
define amdgpu_ps void @vgpr_to_vcc_trunc(i32 %a, i32 %b, i32 %c, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: vgpr_to_vcc_trunc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_and_b32_e32 v0, 1, v0
; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; OLD_RBS-NEXT: global_store_dword v[3:4], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: vgpr_to_vcc_trunc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_and_b32_e32 v0, 1, v0
; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; NEW_RBS-NEXT: global_store_dword v[3:4], v0, off
; NEW_RBS-NEXT: s_endpgm
  %vcc = trunc i32 %a to i1
  %select = select i1 %vcc, i32 %b, i32 %c
  store i32 %select, ptr addrspace(1) %ptr
  ret void
}
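
; A sketch (assumed MIR, not checked by FileCheck) of the divergent i1 trunc
; above: the low bit is masked out and compared to produce a lane mask in vcc:
;   %bit:vgpr(s32) = G_AND %a, %one                  ; v_and_b32
;   %vcc:vcc(s1) = G_ICMP intpred(ne), %bit, %zero   ; v_cmp_ne_u32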

; an i1 input to zext and sext is something that survived the legalizer (not a trunc);
; lower it to a select
define amdgpu_ps void @zext(i32 inreg %a, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: zext:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 10
; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: zext:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10
; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %bool = icmp eq i32 %a, 10
  %zext = zext i1 %bool to i32
  store i32 %zext, ptr addrspace(1) %ptr
  ret void
}

define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: sext:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_eq_u32 s0, 10
; OLD_RBS-NEXT: s_cselect_b32 s0, 1, 0
; OLD_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: sext:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10
; NEW_RBS-NEXT: s_cselect_b32 s0, -1, 0
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %bool = icmp eq i32 %a, 10
  %sext = sext i1 %bool to i32
  store i32 %sext, ptr addrspace(1) %ptr
  ret void
}

; divergent i1 bitwise, i1 vcc;
; selected into s_and_b32 on wave32 or s_and_b64 on wave64
define amdgpu_ps void @and_i1_vcc(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: and_i1_vcc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: v_cmp_le_u32_e32 vcc_lo, 10, v0
; OLD_RBS-NEXT: v_cmp_le_u32_e64 s0, 20, v1
; OLD_RBS-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; OLD_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; OLD_RBS-NEXT: global_store_dword v[2:3], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: and_i1_vcc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: v_cmp_le_u32_e32 vcc_lo, 10, v0
; NEW_RBS-NEXT: v_cmp_le_u32_e64 s0, 20, v1
; NEW_RBS-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off
; NEW_RBS-NEXT: s_endpgm
  %cmp_a = icmp uge i32 %a, 10
  %cmp_b = icmp uge i32 %b, 20
  %cc = and i1 %cmp_a, %cmp_b
  %res = select i1 %cc, i32 %a, i32 %b
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}
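
; Divergent i1 values are wave-wide lane masks, so the i1 and above becomes a
; scalar op on the masks even though the value is divergent; a sketch (assumed
; MIR, not checked by FileCheck):
;   %cc:vcc(s1) = G_AND %cmp_a, %cmp_b   ; s_and_b32 of two lane masks (wave32)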

; uniform i1 bitwise, i32 sgpr; selected into s_and_b32
define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) {
; OLD_RBS-LABEL: and_i1_scc:
; OLD_RBS: ; %bb.0:
; OLD_RBS-NEXT: s_cmp_ge_u32 s0, 10
; OLD_RBS-NEXT: s_cselect_b32 s2, 1, 0
; OLD_RBS-NEXT: s_cmp_ge_u32 s1, 20
; OLD_RBS-NEXT: s_cselect_b32 s3, 1, 0
; OLD_RBS-NEXT: s_and_b32 s2, s2, s3
; OLD_RBS-NEXT: s_and_b32 s2, s2, 1
; OLD_RBS-NEXT: s_cmp_lg_u32 s2, 0
; OLD_RBS-NEXT: s_cselect_b32 s0, s0, s1
; OLD_RBS-NEXT: v_mov_b32_e32 v2, s0
; OLD_RBS-NEXT: global_store_dword v[0:1], v2, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: and_i1_scc:
; NEW_RBS: ; %bb.0:
; NEW_RBS-NEXT: s_cmp_ge_u32 s0, 10
; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0
; NEW_RBS-NEXT: s_cmp_ge_u32 s1, 20
; NEW_RBS-NEXT: s_cselect_b32 s3, 1, 0
; NEW_RBS-NEXT: s_and_b32 s2, s2, s3
; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0
; NEW_RBS-NEXT: s_cselect_b32 s0, s0, s1
; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0
; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off
; NEW_RBS-NEXT: s_endpgm
  %cmp_a = icmp uge i32 %a, 10
  %cmp_b = icmp uge i32 %b, 20
  %cc = and i1 %cmp_a, %cmp_b
  %res = select i1 %cc, i32 %a, i32 %b
  store i32 %res, ptr addrspace(1) %ptr
  ret void
}

; old RBS selects an sgpr phi because it had sgpr inputs
define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) {
; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs:
; OLD_RBS: ; %bb.0: ; %A
; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; OLD_RBS-NEXT: ; %bb.1: ; %B
; OLD_RBS-NEXT: s_mov_b32 s0, 1
; OLD_RBS-NEXT: ; %bb.2: ; %exit
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1
; OLD_RBS-NEXT: v_mov_b32_e32 v0, s0
; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: divergent_phi_with_uniform_inputs:
; NEW_RBS: ; %bb.0: ; %A
; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0
; NEW_RBS-NEXT: s_and_saveexec_b32 s0, vcc_lo
; NEW_RBS-NEXT: ; %bb.1: ; %B
; NEW_RBS-NEXT: s_mov_b32 s1, 1
; NEW_RBS-NEXT: v_mov_b32_e32 v0, s1
; NEW_RBS-NEXT: ; %bb.2: ; %exit
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0
; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT: s_endpgm
A:
  %cmp = icmp eq i32 %a, 0
  br i1 %cmp, label %B, label %exit

B:
  br label %exit

exit:
  %phi = phi i32 [ 0, %A ], [ 1, %B ]
  store i32 %phi, ptr addrspace(1) %out
  ret void
}
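
; The phi above is divergent (its incoming value depends on a divergent
; branch), so the new RBS copies the sgpr inputs to vgpr in the predecessors;
; a sketch (assumed MIR, not checked by FileCheck):
;   bb.A:    %v0:vgpr(s32) = COPY %zero:sgpr(s32)
;   bb.B:    %v1:vgpr(s32) = COPY %one:sgpr(s32)
;   bb.exit: %phi:vgpr(s32) = G_PHI %v0(s32), %bb.A, %v1(s32), %bb.B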

; old RBS assigned a vgpr to the uniform phi (because one input had an undetermined bank)
; and it propagated to the mul, which was not wrong.
; new RBS assigns a vgpr to the destination of the mul even though both inputs are sgpr.
; TODO: implement temporal divergence lowering
define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, ptr addrspace(1) %addr) {
; OLD_RBS-LABEL: divergent_because_of_temporal_divergent_use:
; OLD_RBS: ; %bb.0: ; %entry
; OLD_RBS-NEXT: s_mov_b32 s0, -1
; OLD_RBS-NEXT: v_mov_b32_e32 v3, s0
; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: .LBB15_1: ; %loop
; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; OLD_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3
; OLD_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3
; OLD_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
; OLD_RBS-NEXT: s_or_b32 s0, vcc_lo, s0
; OLD_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT: s_cbranch_execnz .LBB15_1
; OLD_RBS-NEXT: ; %bb.2: ; %exit
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT: v_mul_lo_u32 v0, v3, 10
; OLD_RBS-NEXT: global_store_dword v[1:2], v0, off
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use:
; NEW_RBS: ; %bb.0: ; %entry
; NEW_RBS-NEXT: s_mov_b32 s0, -1
; NEW_RBS-NEXT: s_mov_b32 s1, 0
; NEW_RBS-NEXT: .LBB15_1: ; %loop
; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; NEW_RBS-NEXT: s_add_i32 s0, s0, 1
; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v3, s0
; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0
; NEW_RBS-NEXT: s_or_b32 s1, vcc_lo, s1
; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; NEW_RBS-NEXT: s_cbranch_execnz .LBB15_1
; NEW_RBS-NEXT: ; %bb.2: ; %exit
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1
; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0
; NEW_RBS-NEXT: v_mul_lo_u32 v0, v0, 10
; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off
; NEW_RBS-NEXT: s_endpgm
entry:
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
  %f.counter = uitofp i32 %counter to float
  %cond = fcmp ogt float %f.counter, %val
  %counter.plus.1 = add i32 %counter, 1
  br i1 %cond, label %exit, label %loop

exit:
  %ceilx10 = mul i32 %counter, 10
  store i32 %ceilx10, ptr addrspace(1) %addr
  ret void
}
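
; Temporal divergence: %counter is uniform within the loop, but lanes leave
; the loop on different iterations, so its use after the loop is divergent.
; Until the TODO above is implemented, this shows up as a plain sgpr-to-vgpr
; copy after the loop (the v_mov_b32 above); a sketch (assumed MIR, not
; checked by FileCheck):
;   %v:vgpr(s32) = COPY %counter:sgpr(s32)
;   %mul:vgpr(s32) = G_MUL %v, %ten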

; variables that handle the counter can be allocated to sgprs
define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; OLD_RBS-LABEL: loop_with_2breaks:
; OLD_RBS: ; %bb.0: ; %entry
; OLD_RBS-NEXT: s_mov_b32 s0, 0
; OLD_RBS-NEXT: ; implicit-def: $sgpr1
; OLD_RBS-NEXT: v_mov_b32_e32 v6, s0
; OLD_RBS-NEXT: s_branch .LBB16_3
; OLD_RBS-NEXT: .LBB16_1: ; %Flow3
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: s_waitcnt_depctr 0xffe3
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3
; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT: s_and_b32 s3, exec_lo, s4
; OLD_RBS-NEXT: s_or_b32 s1, s1, s3
; OLD_RBS-NEXT: .LBB16_2: ; %Flow
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s2
; OLD_RBS-NEXT: s_and_b32 s2, exec_lo, s1
; OLD_RBS-NEXT: s_or_b32 s0, s2, s0
; OLD_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; OLD_RBS-NEXT: s_cbranch_execz .LBB16_6
; OLD_RBS-NEXT: .LBB16_3: ; %A
; OLD_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; OLD_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; OLD_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo
; OLD_RBS-NEXT: s_and_b32 s2, exec_lo, -1
; OLD_RBS-NEXT: s_or_b32 s1, s1, s2
; OLD_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
; OLD_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; OLD_RBS-NEXT: global_load_dword v9, v[9:10], off
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
; OLD_RBS-NEXT: s_and_saveexec_b32 s2, vcc_lo
; OLD_RBS-NEXT: s_cbranch_execz .LBB16_2
; OLD_RBS-NEXT: ; %bb.4: ; %B
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7
; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
; OLD_RBS-NEXT: s_mov_b32 s4, -1
; OLD_RBS-NEXT: global_load_dword v9, v[9:10], off
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
; OLD_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo
; OLD_RBS-NEXT: s_cbranch_execz .LBB16_1
; OLD_RBS-NEXT: ; %bb.5: ; %loop.body
; OLD_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; OLD_RBS-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
; OLD_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; OLD_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6
; OLD_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
; OLD_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo
; OLD_RBS-NEXT: global_load_dword v9, v[7:8], off
; OLD_RBS-NEXT: v_mov_b32_e32 v6, v10
; OLD_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo
; OLD_RBS-NEXT: s_or_b32 s4, s4, s5
; OLD_RBS-NEXT: s_waitcnt vmcnt(0)
; OLD_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9
; OLD_RBS-NEXT: global_store_dword v[7:8], v9, off
; OLD_RBS-NEXT: s_branch .LBB16_1
; OLD_RBS-NEXT: .LBB16_6: ; %exit
; OLD_RBS-NEXT: s_endpgm
;
; NEW_RBS-LABEL: loop_with_2breaks:
; NEW_RBS: ; %bb.0: ; %entry
; NEW_RBS-NEXT: s_mov_b32 s4, 0
; NEW_RBS-NEXT: s_mov_b32 s0, 0
; NEW_RBS-NEXT: ; implicit-def: $sgpr5
; NEW_RBS-NEXT: s_branch .LBB16_3
; NEW_RBS-NEXT: .LBB16_1: ; %Flow3
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s7
; NEW_RBS-NEXT: s_andn2_b32 s2, s5, exec_lo
; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s6
; NEW_RBS-NEXT: s_or_b32 s5, s2, s3
; NEW_RBS-NEXT: .LBB16_2: ; %Flow
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1
; NEW_RBS-NEXT: s_and_b32 s1, exec_lo, s5
; NEW_RBS-NEXT: s_or_b32 s4, s1, s4
; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_6
; NEW_RBS-NEXT: .LBB16_3: ; %A
; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1
; NEW_RBS-NEXT: s_ashr_i32 s1, s0, 31
; NEW_RBS-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
; NEW_RBS-NEXT: s_andn2_b32 s1, s5, exec_lo
; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2
; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, exec_lo
; NEW_RBS-NEXT: s_or_b32 s5, s1, s5
; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
; NEW_RBS-NEXT: global_load_dword v6, v[6:7], off
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_2
; NEW_RBS-NEXT: ; %bb.4: ; %B
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2
; NEW_RBS-NEXT: s_mov_b32 s6, exec_lo
; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v4, v6
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v5, v7, vcc_lo
; NEW_RBS-NEXT: global_load_dword v6, v[6:7], off
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; NEW_RBS-NEXT: s_and_saveexec_b32 s7, vcc_lo
; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1
; NEW_RBS-NEXT: ; %bb.5: ; %loop.body
; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1
; NEW_RBS-NEXT: v_mov_b32_e32 v7, s3
; NEW_RBS-NEXT: v_mov_b32_e32 v6, s2
; NEW_RBS-NEXT: s_add_i32 s2, s0, 1
; NEW_RBS-NEXT: s_cmpk_lt_u32 s0, 0x64
; NEW_RBS-NEXT: s_cselect_b32 s0, exec_lo, 0
; NEW_RBS-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6
; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
; NEW_RBS-NEXT: s_andn2_b32 s3, s6, exec_lo
; NEW_RBS-NEXT: s_and_b32 s0, exec_lo, s0
; NEW_RBS-NEXT: s_or_b32 s6, s3, s0
; NEW_RBS-NEXT: global_load_dword v8, v[6:7], off
; NEW_RBS-NEXT: s_mov_b32 s0, s2
; NEW_RBS-NEXT: s_waitcnt vmcnt(0)
; NEW_RBS-NEXT: v_add_nc_u32_e32 v8, 1, v8
; NEW_RBS-NEXT: global_store_dword v[6:7], v8, off
; NEW_RBS-NEXT: s_branch .LBB16_1
; NEW_RBS-NEXT: .LBB16_6: ; %exit
; NEW_RBS-NEXT: s_endpgm
entry:
  br label %A

A:
  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
  %a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
  %a.val = load i32, ptr addrspace(1) %a.plus.counter
  %a.cond = icmp eq i32 %a.val, 0
  br i1 %a.cond, label %exit, label %B

B:
  %b.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %b, i32 %counter
  %b.val = load i32, ptr addrspace(1) %b.plus.counter
  %b.cond = icmp eq i32 %b.val, 0
  br i1 %b.cond, label %exit, label %loop.body

loop.body:
  %x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
  %x.val = load i32, ptr addrspace(1) %x.plus.counter
  %x.val.plus.1 = add i32 %x.val, 1
  store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
  %counter.plus.1 = add i32 %counter, 1
  %x.cond = icmp ult i32 %counter, 100
  br i1 %x.cond, label %exit, label %A

exit:
  ret void
}

declare i16 @llvm.abs.i16(i16, i1)
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)