; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s --check-prefix=SSE
; RUN: opt -passes="default<O3>" -S -mattr=avx < %s | FileCheck %s --check-prefix=AVX

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--"

%union.ElementWiseAccess = type { <4 x float> }

$getAt = comdat any

define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT:    ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT:    ret <4 x float> [[TMP3]]
;
  %2 = alloca ptr, align 8
  %3 = alloca <4 x float>, align 16
  store ptr %0, ptr %2, align 8
  %4 = load ptr, ptr %2, align 8
  %5 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %4)
  %6 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %5, i32 noundef 0)
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = load ptr, ptr %2, align 8
  %9 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %8)
  %10 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %9, i32 noundef 1)
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load ptr, ptr %2, align 8
  %13 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %12)
  %14 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %13, i32 noundef 2)
  %15 = insertelement <4 x float> %11, float %14, i32 2
  %16 = load ptr, ptr %2, align 8
  %17 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %16)
  %18 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %17, i32 noundef 2)
  %19 = insertelement <4 x float> %15, float %18, i32 3
  store <4 x float> %19, ptr %3, align 16
  %20 = load <4 x float>, ptr %3, align 16
  ret <4 x float> %20
}
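
; Same conversion as above, but the cast helper is called by value, so each
; getAt() goes through a { double, double } temporary. The expected optimized
; form (see the CHECK lines below) is a single <4 x float> load plus a scalar
; reload of element 2, which is inserted into both lanes 2 and 3.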
define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
; SSE-LABEL: @ConvertVectors_ByVal(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
; SSE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 8
; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
;
; AVX-LABEL: @ConvertVectors_ByVal(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
; AVX-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 8
; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
;
entry:
  %V.addr = alloca ptr, align 8
  %.compoundliteral = alloca <4 x float>, align 16
  %ref.tmp = alloca %union.ElementWiseAccess, align 16
  %ref.tmp2 = alloca %union.ElementWiseAccess, align 16
  %ref.tmp7 = alloca %union.ElementWiseAccess, align 16
  %ref.tmp12 = alloca %union.ElementWiseAccess, align 16
  store ptr %V, ptr %V.addr, align 8
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp) #4
  %0 = load ptr, ptr %V.addr, align 8
  %call = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %0)
  %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp, i32 0, i32 0
  %1 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 0
  %2 = extractvalue { double, double } %call, 0
  store double %2, ptr %1, align 16
  %3 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 1
  %4 = extractvalue { double, double } %call, 1
  store double %4, ptr %3, align 8
  %call1 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp, i32 noundef 0)
  %vecinit = insertelement <4 x float> undef, float %call1, i32 0
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp2) #4
  %5 = load ptr, ptr %V.addr, align 8
  %call3 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %5)
  %coerce.dive4 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp2, i32 0, i32 0
  %6 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 0
  %7 = extractvalue { double, double } %call3, 0
  store double %7, ptr %6, align 16
  %8 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 1
  %9 = extractvalue { double, double } %call3, 1
  store double %9, ptr %8, align 8
  %call5 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp2, i32 noundef 1)
  %vecinit6 = insertelement <4 x float> %vecinit, float %call5, i32 1
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp7) #4
  %10 = load ptr, ptr %V.addr, align 8
  %call8 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %10)
  %coerce.dive9 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp7, i32 0, i32 0
  %11 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 0
  %12 = extractvalue { double, double } %call8, 0
  store double %12, ptr %11, align 16
  %13 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 1
  %14 = extractvalue { double, double } %call8, 1
  store double %14, ptr %13, align 8
  %call10 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp7, i32 noundef 2)
  %vecinit11 = insertelement <4 x float> %vecinit6, float %call10, i32 2
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp12) #4
  %15 = load ptr, ptr %V.addr, align 8
  %call13 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %15)
  %coerce.dive14 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp12, i32 0, i32 0
  %16 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 0
  %17 = extractvalue { double, double } %call13, 0
  store double %17, ptr %16, align 16
  %18 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 1
  %19 = extractvalue { double, double } %call13, 1
  store double %19, ptr %18, align 8
  %call15 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp12, i32 noundef 2)
  %vecinit16 = insertelement <4 x float> %vecinit11, float %call15, i32 3
  store <4 x float> %vecinit16, ptr %.compoundliteral, align 16
  %20 = load <4 x float>, ptr %.compoundliteral, align 16
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp12) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp7) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp2) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp) #4
  ret <4 x float> %20
}
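
; Helpers for the tests above. They model the union access: the
; castToElementWiseAccess_* functions reinterpret the vector as
; %union.ElementWiseAccess (by value and by reference), and the two getAt
; flavors index into it as [4 x float]. All of them are small enough that the
; O3 pipeline is expected to inline them, which is what exposes the scalar
; loads to VectorCombine and SLP in the first place.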
define internal { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %t) #1 {
entry:
  %retval = alloca %union.ElementWiseAccess, align 16
  %t.addr = alloca ptr, align 8
  store ptr %t, ptr %t.addr, align 8
  %0 = load ptr, ptr %t.addr, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval, ptr align 16 %0, i64 16, i1 false)
  %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %retval, i32 0, i32 0
  %1 = load { double, double }, ptr %coerce.dive, align 16
  ret { double, double } %1
}

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3

define internal noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #1 {
  %2 = alloca ptr, align 8
  store ptr %0, ptr %2, align 8
  %3 = load ptr, ptr %2, align 8
  ret ptr %3
}

define linkonce_odr dso_local noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %0, i32 noundef %1) #1 comdat align 2 {
  %3 = alloca ptr, align 8
  %4 = alloca i32, align 4
  store ptr %0, ptr %3, align 8
  store i32 %1, ptr %4, align 4
  %5 = load ptr, ptr %3, align 8
  %6 = load i32, ptr %4, align 4
  %7 = sext i32 %6 to i64
  %8 = getelementptr inbounds [4 x float], ptr %5, i64 0, i64 %7
  %9 = load float, ptr %8, align 4
  ret float %9
}

define linkonce_odr noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %this, i32 noundef %i) #1 align 2 {
entry:
  %this.addr = alloca ptr, align 8
  %i.addr = alloca i32, align 4
  store ptr %this, ptr %this.addr, align 8
  store i32 %i, ptr %i.addr, align 4
  %this1 = load ptr, ptr %this.addr, align 8
  %0 = load i32, ptr %i.addr, align 4
  %idxprom = sext i32 %0 to i64
  %arrayidx = getelementptr inbounds [4 x float], ptr %this1, i64 0, i64 %idxprom
  %1 = load float, ptr %arrayidx, align 4
  ret float %1
}

; Vector combine + SLP should form a narrow load and a vector cast
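; The IR below is roughly what Clang emits at -O0 for something like the
; following C (a hypothetical reconstruction; the original source is not part
; of this test):
;
;   typedef int   v8i32 __attribute__((vector_size(32)));
;   typedef float v4f32 __attribute__((vector_size(16)));
;   void PR51397(v4f32 *dst, v8i32 *srcp) {
;     v8i32 src = *srcp;
;     *dst = (v4f32){ (float)src[0], (float)src[1],
;                     (float)src[2], (float)src[3] };
;   }
;
; Only lanes 0..3 of the <8 x i32> load are used, so VectorCombine should
; shrink it to a <4 x i32> load and SLP should fuse the four sitofp
; conversions into a single vector cast, as the CHECK lines verify.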
define void @PR51397(ptr noundef %dst, ptr noundef %srcp) {
; SSE-LABEL: @PR51397(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[SRCP:%.*]], align 16
; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; SSE-NEXT:    store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 16
; SSE-NEXT:    ret void
;
; AVX-LABEL: @PR51397(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[SRCP:%.*]], align 16
; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
; AVX-NEXT:    store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 16
; AVX-NEXT:    ret void
;
  %src = load <8 x i32>, ptr %srcp, align 16
  %vecext = extractelement <8 x i32> %src, i32 0
  %conv = sitofp i32 %vecext to float
  %vecinit = insertelement <4 x float> undef, float %conv, i32 0
  %vecext1 = extractelement <8 x i32> %src, i32 1
  %conv2 = sitofp i32 %vecext1 to float
  %vecinit3 = insertelement <4 x float> %vecinit, float %conv2, i32 1
  %vecext4 = extractelement <8 x i32> %src, i32 2
  %conv5 = sitofp i32 %vecext4 to float
  %vecinit6 = insertelement <4 x float> %vecinit3, float %conv5, i32 2
  %vecext7 = extractelement <8 x i32> %src, i32 3
  %conv8 = sitofp i32 %vecext7 to float
  %vecinit9 = insertelement <4 x float> %vecinit6, float %conv8, i32 3
  store <4 x float> %vecinit9, ptr %dst, align 16
  ret void
}
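
; The attribute group definitions referenced above (#0-#4) fall outside this
; excerpt. The groups below are assumed minimal stand-ins so that the module
; still parses; in particular, #1 must not contain noinline, or the helpers
; would survive into the final IR and defeat the checks.
attributes #0 = { mustprogress nounwind uwtable }
attributes #1 = { mustprogress nounwind uwtable }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #4 = { nounwind }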