; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX

declare void @use(<4 x i32>)
declare void @usef(<4 x float>)

; Eliminating an insert is profitable.

define <16 x i8> @ins0_ins0_add(i8 %x, i8 %y) {
; CHECK-LABEL: @ins0_ins0_add(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = add i8 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x i8> undef, i8 [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <16 x i8> [[R]]
;
  %i0 = insertelement <16 x i8> undef, i8 %x, i32 0
  %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
  %r = add <16 x i8> %i0, %i1
  ret <16 x i8> %r
}

; Eliminating an insert is still profitable. Flags propagate. Mismatched index types are ok.

define <8 x i16> @ins0_ins0_sub_flags(i16 %x, i16 %y) {
; CHECK-LABEL: @ins0_ins0_sub_flags(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = sub nuw nsw i16 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[R_SCALAR]], i64 5
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %i0 = insertelement <8 x i16> undef, i16 %x, i8 5
  %i1 = insertelement <8 x i16> undef, i16 %y, i32 5
  %r = sub nsw nuw <8 x i16> %i0, %i1
  ret <8 x i16> %r
}

; The new vector constant is calculated by constant folding.
; This is conservatively created as zero rather than undef for 'undef ^ undef'.

define <2 x i64> @ins1_ins1_xor(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_xor(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = xor i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> undef, i64 %x, i64 1
  %i1 = insertelement <2 x i64> undef, i64 %y, i32 1
  %r = xor <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

define <2 x i64> @ins1_ins1_iterate(i64 %w, i64 %x, i64 %y, i64 %z) {
; CHECK-LABEL: @ins1_ins1_iterate(
; CHECK-NEXT:    [[S0_SCALAR:%.*]] = sub i64 [[W:%.*]], [[X:%.*]]
; CHECK-NEXT:    [[S1_SCALAR:%.*]] = or i64 [[S0_SCALAR]], [[Y:%.*]]
; CHECK-NEXT:    [[S2_SCALAR:%.*]] = shl i64 [[Z:%.*]], [[S1_SCALAR]]
; CHECK-NEXT:    [[S2:%.*]] = insertelement <2 x i64> poison, i64 [[S2_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[S2]]
;
  %i0 = insertelement <2 x i64> undef, i64 %w, i64 1
  %i1 = insertelement <2 x i64> undef, i64 %x, i32 1
  %s0 = sub <2 x i64> %i0, %i1
  %i2 = insertelement <2 x i64> undef, i64 %y, i32 1
  %s1 = or <2 x i64> %s0, %i2
  %i3 = insertelement <2 x i64> undef, i64 %z, i32 1
  %s2 = shl <2 x i64> %i3, %s1
  ret <2 x i64> %s2
}

; The inserts are free, but it's still better to scalarize.

define <2 x double> @ins0_ins0_fadd(double %x, double %y) {
; CHECK-LABEL: @ins0_ins0_fadd(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = fadd reassoc nsz double [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> undef, double [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %i0 = insertelement <2 x double> undef, double %x, i32 0
  %i1 = insertelement <2 x double> undef, double %y, i32 0
  %r = fadd reassoc nsz <2 x double> %i0, %i1
  ret <2 x double> %r
}
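
; For reference, a sketch of the rewrite exercised above (names such as
; %r.scalar are illustrative; the FileCheck lines are authoritative): a binop
; of two single-lane inserts at the same index becomes one scalar binop plus
; one insert, with the untouched lanes supplied by constant folding.
;
;   %i0 = insertelement <2 x i64> undef, i64 %x, i64 1
;   %i1 = insertelement <2 x i64> undef, i64 %y, i64 1
;   %r = xor <2 x i64> %i0, %i1
; =>
;   %r.scalar = xor i64 %x, %y
;   %r = insertelement <2 x i64> zeroinitializer, i64 %r.scalar, i64 1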

; Negative test - mismatched indexes (but could fold this).

define <16 x i8> @ins1_ins0_add(i8 %x, i8 %y) {
; CHECK-LABEL: @ins1_ins0_add(
; CHECK-NEXT:    [[I0:%.*]] = insertelement <16 x i8> undef, i8 [[X:%.*]], i32 1
; CHECK-NEXT:    [[I1:%.*]] = insertelement <16 x i8> undef, i8 [[Y:%.*]], i32 0
; CHECK-NEXT:    [[R:%.*]] = add <16 x i8> [[I0]], [[I1]]
; CHECK-NEXT:    ret <16 x i8> [[R]]
;
  %i0 = insertelement <16 x i8> undef, i8 %x, i32 1
  %i1 = insertelement <16 x i8> undef, i8 %y, i32 0
  %r = add <16 x i8> %i0, %i1
  ret <16 x i8> %r
}

; Base vector does not have to be undef.

define <4 x i32> @ins0_ins0_mul(i32 %x, i32 %y) {
; CHECK-LABEL: @ins0_ins0_mul(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %i0 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
  %i1 = insertelement <4 x i32> undef, i32 %y, i32 0
  %r = mul <4 x i32> %i0, %i1
  ret <4 x i32> %r
}

; It is safe to scalarize any binop (no extra UB/poison danger).

define <2 x i64> @ins1_ins1_sdiv(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_sdiv(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = sdiv i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 -6, i64 0>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> <i64 42, i64 -42>, i64 %x, i64 1
  %i1 = insertelement <2 x i64> <i64 -7, i64 128>, i64 %y, i32 1
  %r = sdiv <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

; Constant folding deals with undef per element - the entire value does not become undef.

define <2 x i64> @ins1_ins1_udiv(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_udiv(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = udiv i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 6, i64 poison>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i32 1
  %i1 = insertelement <2 x i64> <i64 7, i64 undef>, i64 %y, i32 1
  %r = udiv <2 x i64> %i0, %i1
  ret <2 x i64> %r
}

; This could be simplified: without the transform, the urem has immediate UB
; because the divisor has an undef element, but that UB is hidden after the transform.

define <2 x i64> @ins1_ins1_urem(i64 %x, i64 %y) {
; CHECK-LABEL: @ins1_ins1_urem(
; CHECK-NEXT:    [[R_SCALAR:%.*]] = urem i64 [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <2 x i64> [[R]]
;
  %i0 = insertelement <2 x i64> <i64 42, i64 undef>, i64 %x, i64 1
  %i1 = insertelement <2 x i64> <i64 undef, i64 128>, i64 %y, i32 1
  %r = urem <2 x i64> %i0, %i1
  ret <2 x i64> %r
}
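
; A worked example of the per-element constant folding noted above (informal
; commentary; the FileCheck lines are authoritative): in @ins1_ins1_udiv, the
; surviving lane 0 folds as 'udiv i64 42, 7' -> 6, while lane 1 is overwritten
; by the inserted scalar result, so its folded value ('udiv undef, undef' ->
; poison) never matters.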

; Extra use is accounted for in cost calculation.

define <4 x i32> @ins0_ins0_xor(i32 %x, i32 %y) {
; CHECK-LABEL: @ins0_ins0_xor(
; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
; CHECK-NEXT:    call void @use(<4 x i32> [[I0]])
; CHECK-NEXT:    [[R_SCALAR:%.*]] = xor i32 [[X]], [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[R_SCALAR]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %i0 = insertelement <4 x i32> undef, i32 %x, i32 0
  call void @use(<4 x i32> %i0)
  %i1 = insertelement <4 x i32> undef, i32 %y, i32 0
  %r = xor <4 x i32> %i0, %i1
  ret <4 x i32> %r
}

; Extra use is accounted for in cost calculation.

define <4 x float> @ins1_ins1_fmul(float %x, float %y) {
; CHECK-LABEL: @ins1_ins1_fmul(
; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 1
; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
; CHECK-NEXT:    [[R_SCALAR:%.*]] = fmul float [[X:%.*]], [[Y]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[R_SCALAR]], i64 1
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %i0 = insertelement <4 x float> undef, float %x, i32 1
  %i1 = insertelement <4 x float> undef, float %y, i32 1
  call void @usef(<4 x float> %i1)
  %r = fmul <4 x float> %i0, %i1
  ret <4 x float> %r
}

; If the scalar binop is not cheaper than the vector binop, extra uses can prevent the transform.

define <4 x float> @ins2_ins2_fsub(float %x, float %y) {
; CHECK-LABEL: @ins2_ins2_fsub(
; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2
; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
; CHECK-NEXT:    [[R:%.*]] = fsub <4 x float> [[I0]], [[I1]]
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %i0 = insertelement <4 x float> undef, float %x, i32 2
  call void @usef(<4 x float> %i0)
  %i1 = insertelement <4 x float> undef, float %y, i32 2
  call void @usef(<4 x float> %i1)
  %r = fsub <4 x float> %i0, %i1
  ret <4 x float> %r
}

; It may be worth scalarizing an expensive binop even if both inserts have extra uses.

define <4 x float> @ins3_ins3_fdiv(float %x, float %y) {
; SSE-LABEL: @ins3_ins3_fdiv(
; SSE-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 3
; SSE-NEXT:    call void @usef(<4 x float> [[I0]])
; SSE-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 3
; SSE-NEXT:    call void @usef(<4 x float> [[I1]])
; SSE-NEXT:    [[R_SCALAR:%.*]] = fdiv float [[X]], [[Y]]
; SSE-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[R_SCALAR]], i64 3
; SSE-NEXT:    ret <4 x float> [[R]]
;
; AVX-LABEL: @ins3_ins3_fdiv(
; AVX-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 3
; AVX-NEXT:    call void @usef(<4 x float> [[I0]])
; AVX-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 3
; AVX-NEXT:    call void @usef(<4 x float> [[I1]])
; AVX-NEXT:    [[R:%.*]] = fdiv <4 x float> [[I0]], [[I1]]
; AVX-NEXT:    ret <4 x float> [[R]]
;
  %i0 = insertelement <4 x float> undef, float %x, i32 3
  call void @usef(<4 x float> %i0)
  %i1 = insertelement <4 x float> undef, float %y, i32 3
  call void @usef(<4 x float> %i1)
  %r = fdiv <4 x float> %i0, %i1
  ret <4 x float> %r
}
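
; Note on the SSE/AVX split in @ins3_ins3_fdiv (an informal reading of the
; cost model, not an exact cost table): with both inserts kept alive by their
; extra uses, scalarizing only pays when the vector op itself is expensive.
; Under the SSE2 model, a scalar fdiv plus one insert is cheaper than the
; vector fdiv, so that run scalarizes; under the AVX2 model it is not, so the
; vector fdiv remains.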