; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: We would still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) {
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %mul5, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) {
; GCN-LABEL: @test1_as_3_0_0(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
; GCN-NEXT:    store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr %b, i64 1
  %i4 = load half, ptr %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, ptr %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr %c, i64 1
  store half %mul5, ptr %arrayidx5, align 2
  ret void
}

define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) {
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr %a, align 2
  %i1 = load half, ptr %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, ptr %a, i64 1
  %i3 = load half, ptr %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr %b, i64 1
  %i4 = load half, ptr %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %mul5, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

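; Two scalar llvm.fma.f16 calls should combine into a single llvm.fma.v2f16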
define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
; GCN-LABEL: @test1_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
; GCN-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %i2 = load half, ptr addrspace(3) %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  %i5 = load half, ptr addrspace(3) %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, ptr addrspace(3) %d, align 2
  %arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
  store half %fma1, ptr addrspace(3) %arrayidx6, align 2
  ret void
}

define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) {
; GCN-LABEL: @mul_scalar_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
; GCN-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer
; GCN-NEXT:    [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]]
; GCN-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %mul5, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
; GCN-LABEL: @fabs_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
; GCN-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %fabs1, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

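; fabs feeding fma in both lanes: the fabs and fma calls are each vectorized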
define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
; GCN-NEXT:    [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %i2 = load half, ptr addrspace(3) %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  %i5 = load half, ptr addrspace(3) %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, ptr addrspace(3) %d, align 2
  %arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
  store half %fma1, ptr addrspace(3) %arrayidx6, align 2
  ret void
}

define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GCN-NEXT:    [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1
; GCN-NEXT:    [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
; GCN-NEXT:    [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1
; GCN-NEXT:    [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]])
; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %i2 = load half, ptr addrspace(3) %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  %i5 = load half, ptr addrspace(3) %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, ptr addrspace(3) %d, align 2
  %arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
  store half %fma1, ptr addrspace(3) %arrayidx6, align 2
  ret void
}

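; Pair of llvm.canonicalize.f16 calls vectorized to llvm.canonicalize.v2f16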
define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
; GCN-LABEL: @canonicalize_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]])
; GCN-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %canonicalize1, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX9: {{.*}}
; VI: {{.*}}