; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s

; No address space can be inferred for the plain flat pointer %b, so the
; atomicrmw is selected as flat_atomic_add_f64.
define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-LABEL: InferNothing:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
; CHECK-NEXT:    v_mov_b32_e32 v0, s2
; CHECK-NEXT:    v_mov_b32_e32 v1, s3
; CHECK-NEXT:    s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT:    s_add_u32 s0, s2, s0
; CHECK-NEXT:    s_addc_u32 s1, s3, s1
; CHECK-NEXT:    v_mov_b32_e32 v3, s1
; CHECK-NEXT:    v_add_co_u32_e64 v2, vcc, -8, s0
; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; CHECK-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr %b, i64 %i.2
  %i.4 = atomicrmw fadd ptr %i.3, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  ret void
}

; The flat pointer is an addrspacecast of a global pointer, so the atomicrmw is
; selected as global_atomic_add_f64.
define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: InferFadd:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_mov_b64 s[0:1], exec
; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT:    s_cbranch_execz .LBB1_2
; CHECK-NEXT:  ; %bb.1:
; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x24
; CHECK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x2c
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
; CHECK-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
; CHECK-NEXT:    s_add_u32 s2, s8, s2
; CHECK-NEXT:    s_addc_u32 s3, s9, s3
; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; CHECK-NEXT:    v_mul_f64 v[0:1], s[10:11], v[0:1]
; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3] offset:-8
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:  .LBB1_2:
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
  %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
  %0 = atomicrmw fadd ptr %i.4, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  ret void
}

; The atomicrmw through the flat pointer %d stays a flat atomic, while the one
; through the pointer derived from the global %b becomes a global atomic.
define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, double %c, ptr %d) {
; CHECK-LABEL: InferMixed:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x3c
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT:    s_mov_b64 s[6:7], exec
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], vcc
; CHECK-NEXT:    s_cbranch_execz .LBB2_2
; CHECK-NEXT:  ; %bb.1:
; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x24
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s5, s4, 31
; CHECK-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
; CHECK-NEXT:    s_add_u32 s0, s0, s4
; CHECK-NEXT:    s_addc_u32 s1, s1, s5
; CHECK-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s4
; CHECK-NEXT:    v_mul_f64 v[0:1], s[2:3], v[0:1]
; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-7
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:  .LBB2_2:
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
  br label %bb1

bb1:                                              ; preds = %entry
  %i.7 = ptrtoint ptr addrspace(1) %i.3 to i64
  %i.8 = add nsw i64 %i.7, 1
  %i.9 = inttoptr i64 %i.8 to ptr addrspace(1)
  %0 = atomicrmw fadd ptr %d, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  %i.11 = addrspacecast ptr addrspace(1) %i.9 to ptr
  %1 = atomicrmw fadd ptr %i.11, double %c syncscope("agent") seq_cst, align 8, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
  ret void
}

; The global origin remains visible through the pointer PHI in the loop, so the
; atomicrmw is selected as global_atomic_add_f64.
define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, double %c) {
; CHECK-LABEL: InferPHI:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
; CHECK-NEXT:    s_lshl_b64 s[4:5], s[6:7], 3
; CHECK-NEXT:    s_add_u32 s0, s0, s4
; CHECK-NEXT:    s_addc_u32 s1, s1, s5
; CHECK-NEXT:    s_add_u32 s4, s0, -8
; CHECK-NEXT:    s_addc_u32 s5, s1, -1
; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 9
; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
; CHECK-NEXT:  .LBB3_1: ; %bb0
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
; CHECK-NEXT:    s_cbranch_vccnz .LBB3_1
; CHECK-NEXT:  ; %bb.2: ; %bb1
; CHECK-NEXT:    s_mov_b64 s[0:1], exec
; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT:    s_cbranch_execz .LBB3_4
; CHECK-NEXT:  ; %bb.3:
; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; CHECK-NEXT:    v_mul_f64 v[0:1], s[2:3], v[0:1]
; CHECK-NEXT:    v_mov_b32_e32 v2, 0
; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[4:5]
; CHECK-NEXT:    s_waitcnt vmcnt(0)
; CHECK-NEXT:    buffer_wbinvl1_vol
; CHECK-NEXT:  .LBB3_4:
; CHECK-NEXT:    s_endpgm
entry:
  %i = add nsw i32 %a, -1
  %i.2 = sext i32 %i to i64
  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
  %i.4 = ptrtoint ptr addrspace(1) %i.3 to i64
  br label %bb0

bb0:                                              ; preds = %bb0, %entry
  %phi = phi ptr addrspace(1) [ %i.3, %entry ], [ %i.9, %bb0 ]
  %i.7 = ptrtoint ptr addrspace(1) %phi to i64
  %i.8 = sub nsw i64 %i.7, 1
  %cmp2 = icmp eq i64 %i.8, 0
  %i.9 = inttoptr i64 %i.7 to ptr addrspace(1)
  br i1 %cmp2, label %bb1, label %bb0

bb1:                                              ; preds = %bb0
  %i.10 = addrspacecast ptr addrspace(1) %i.9 to ptr
  %0 = atomicrmw fadd ptr %i.10, double %c syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
  ret void
}

attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { mustprogress nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }

!0 = !{}
!1 = !{i32 5, i32 6}