; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6-NOHSA %s
; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

; Simple case: load a single f64 from the constant address space and store it to global memory.
; FUNC-LABEL: {{^}}constant_load_f64:
define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-NOHSA-LABEL: constant_load_f64:
; GFX6-NOHSA:       ; %bb.0:
; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NOHSA-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NOHSA-NEXT:    s_mov_b32 s2, -1
; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NOHSA-NEXT:    s_endpgm
;
; GFX7-HSA-LABEL: constant_load_f64:
; GFX7-HSA:       ; %bb.0:
; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-HSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-HSA-NEXT:    s_endpgm
;
; GFX8-NOHSA-LABEL: constant_load_f64:
; GFX8-NOHSA:       ; %bb.0:
; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT:    s_endpgm
;
; GFX12-LABEL: constant_load_f64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
  %ld = load double, ptr addrspace(4) %in
  store double %ld, ptr addrspace(1) %out
  ret void
}

attributes #0 = { nounwind }

; Tests whether a chain of eight 64-bit constant loads gets vectorized into a single wider load.
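; All eight f64 loads below read consecutive offsets from the same constant-address-space
; pointer, so the checks expect them to be merged into a single s_load_dwordx16
; (s_load_b512 on GFX12) rather than emitted as eight separate s_load_dwordx2 loads.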
define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) {
; GFX6-NOHSA-LABEL: constant_load_2v4f64:
; GFX6-NOHSA:       ; %bb.0: ; %entry
; GFX6-NOHSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x9
; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT:    s_load_dwordx2 s[24:25], s[18:19], 0x0
; GFX6-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX6-NOHSA-NEXT:    s_mov_b32 s23, 0xf000
; GFX6-NOHSA-NEXT:    s_mov_b32 s22, -1
; GFX6-NOHSA-NEXT:    s_mov_b32 s20, s18
; GFX6-NOHSA-NEXT:    s_mov_b32 s21, s19
; GFX6-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v0, s24
; GFX6-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[4:5], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[10:11], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[12:13], v[0:1]
; GFX6-NOHSA-NEXT:    v_add_f64 v[0:1], s[14:15], v[0:1]
; GFX6-NOHSA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[20:23], 0
; GFX6-NOHSA-NEXT:    s_endpgm
;
; GFX7-HSA-LABEL: constant_load_2v4f64:
; GFX7-HSA:       ; %bb.0: ; %entry
; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[8:9], 0x0
; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s18
; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s19
; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s20
; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s21
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[4:5], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[10:11], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[12:13], v[0:1]
; GFX7-HSA-NEXT:    v_add_f64 v[0:1], s[14:15], v[0:1]
; GFX7-HSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-HSA-NEXT:    s_endpgm
;
; GFX8-NOHSA-LABEL: constant_load_2v4f64:
; GFX8-NOHSA:       ; %bb.0: ; %entry
; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX8-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s19
; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[0:1], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[2:3], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[4:5], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[10:11], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[12:13], v[0:1]
; GFX8-NOHSA-NEXT:    v_add_f64 v[0:1], s[14:15], v[0:1]
; GFX8-NOHSA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NOHSA-NEXT:    s_endpgm
;
; GFX12-LABEL: constant_load_2v4f64:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[16:19], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b64 s[20:21], s[18:19], 0x0
; GFX12-NEXT:    s_load_b512 s[0:15], s[16:17], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_add_f64_e64 v[0:1], s[0:1], s[20:21]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[2:3], v[0:1]
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[4:5], v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[6:7], v[0:1]
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[8:9], v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[10:11], v[0:1]
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[12:13], v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_f64_e32 v[0:1], s[14:15], v[0:1]
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[18:19]
; GFX12-NEXT:    s_endpgm
entry:
  %out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4
  %tmp = load double, ptr addrspace(4) %weights, align 4
  %add = fadd double %tmp, %out_ptr.promoted
  %arrayidx.1 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 1
  %tmp1 = load double, ptr addrspace(4) %arrayidx.1, align 4
  %add.1 = fadd double %tmp1, %add
  %arrayidx.2 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 2
  %tmp2 = load double, ptr addrspace(4) %arrayidx.2, align 4
  %add.2 = fadd double %tmp2, %add.1
  %arrayidx.3 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 3
  %tmp3 = load double, ptr addrspace(4) %arrayidx.3, align 4
  %add.3 = fadd double %tmp3, %add.2
  %arrayidx.4 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 4
  %tmp4 = load double, ptr addrspace(4) %arrayidx.4, align 4
  %add.4 = fadd double %tmp4, %add.3
  %arrayidx.5 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 5
  %tmp5 = load double, ptr addrspace(4) %arrayidx.5, align 4
  %add.5 = fadd double %tmp5, %add.4
  %arrayidx.6 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 6
  %tmp6 = load double, ptr addrspace(4) %arrayidx.6, align 4
  %add.6 = fadd double %tmp6, %add.5
  %arrayidx.7 = getelementptr inbounds double, ptr addrspace(4) %weights, i64 7
  %tmp7 = load double, ptr addrspace(4) %arrayidx.7, align 4
  %add.7 = fadd double %tmp7, %add.6
  store double %add.7, ptr addrspace(1) %out_ptr, align 4
  ret void
}