; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX

; Opaque external user of a scalar float; the multi-use tests call it to give
; an extracted or negated element a second use.
declare void @use(float)

; TODO: The insert is costed as free, so creating a shuffle appears to be a loss.
; NOTE(review): the assertions for @ext0_v4f32 below already show the
; fneg + shufflevector form under both run configurations, so this TODO looks
; stale for the same-width case. Only @ext0_v2f32v4f32 (narrower source) is
; still expected to stay as scalar extract/fneg/insert. Confirm and reword.

define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext0_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 0
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 0
  ret <4 x float> %r
}

; Same pattern with a <2 x float> source and a <4 x float> destination.

define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext0_v2f32v4f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 0
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 0
  ret <4 x float> %r
}

; Eliminating extract/insert is profitable.

define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext2_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 2
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 2
  ret <4 x float> %r
}

; Narrower source: the negated <2 x float> is first widened to <4 x float>
; with a shuffle and then blended into %y.
; NOTE(review): index 2 is out of range for the <2 x float> source, so the
; extract yields poison per the LangRef. The widening mask element "2" in the
; expected output likewise selects from the poison operand. Confirm whether an
; in-range index was intended; the assertions must be regenerated if changed.

define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext2_v2f32v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 2, i32 poison>
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 2
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 2
  ret <4 x float> %r
}

; Eliminating extract/insert is still profitable. Flags propagate.

define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v2f64(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <2 x double> %y, double %n, i32 1
  ret <2 x double> %r
}

; Narrower source into a wider destination: the expected output differs per
; target. With SSE2 the scalar sequence is kept; with AVX2 the negation is
; done on the vector, widened, and blended.

define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
; SSE-LABEL: @ext1_v2f64v4f64(
; SSE-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; SSE-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
; SSE-NEXT:    ret <4 x double> [[R]]
;
; AVX-LABEL: @ext1_v2f64v4f64(
; AVX-NEXT:    [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; AVX-NEXT:    ret <4 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 1
  ret <4 x double> %r
}

define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v8f32(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <8 x float> %x, i32 7
  %n = fneg float %e
  %r = insertelement <8 x float> %y, float %n, i32 7
  ret <8 x float> %r
}

; Here the extract index (3) and the insert index (7) differ; the expected
; output keeps the scalar sequence.

define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 3
  %n = fneg float %e
  %r = insertelement <8 x float> %y, float %n, i32 7
  ret <8 x float> %r
}

; Same as above with an extra use of the extracted element.
; Despite the "ext7" names, the multi-use tests below operate on element 5
; or element 3, not element 7.

define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
; SSE-LABEL: @ext7_v8f32_use1(
; SSE-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
; SSE-NEXT:    call void @use(float [[E]])
; SSE-NEXT:    [[N:%.*]] = fneg float [[E]]
; SSE-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
; SSE-NEXT:    ret <8 x float> [[R]]
;
; AVX-LABEL: @ext7_v8f32_use1(
; AVX-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
; AVX-NEXT:    call void @use(float [[E]])
; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[X]]
; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 6, i32 7>
; AVX-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <8 x float> %x, i32 5
  call void @use(float %e)
  %n = fneg float %e
  %r = insertelement <8 x float> %y, float %n, i32 5
  ret <8 x float> %r
}

define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use1(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    call void @use(float [[E]])
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 3
  call void @use(float %e)
  %n = fneg float %e
  %r = insertelement <8 x float> %y, float %n, i32 3
  ret <8 x float> %r
}

; Negative test - the transform is likely not profitable if the fneg has another use.

define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v8f32_use2(
; CHECK-NEXT:    [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    call void @use(float [[N]])
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <8 x float> %x, i32 3
  %n = fneg float %e
  call void @use(float %n)
  %r = insertelement <8 x float> %y, float %n, i32 3
  ret <8 x float> %r
}

define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use2(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    call void @use(float [[N]])
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT:    ret <8 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 3
  %n = fneg float %e
  call void @use(float %n)
  %r = insertelement <8 x float> %y, float %n, i32 3
  ret <8 x float> %r
}

; Negative test - can't convert variable index to a shuffle.

; Both the extract and the insert use the same runtime index %index; the
; expected output is the unchanged scalar sequence.

define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
; CHECK-LABEL: @ext_index_var_v2f64(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 %index
  %n = fneg nsz double %e
  %r = insertelement <2 x double> %y, double %n, i32 %index
  ret <2 x double> %r
}

define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) {
; CHECK-LABEL: @ext_index_var_v2f64v4f64(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]]
; CHECK-NEXT:    ret <4 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 %index
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 %index
  ret <4 x double> %r
}

; Negative test - require same extract/insert index for simple shuffle.
; TODO: We could handle this by adjusting the cost calculation.

define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v2f64_ins0(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 0
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <2 x double> %y, double %n, i32 0
  ret <2 x double> %r
}

; Negative test - extract from an index greater than the vector width of the destination

define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext3_v4f64v2f64(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
; CHECK-NEXT:    ret <2 x double> [[R]]
;
  %e = extractelement <4 x double> %x, i32 3
  %n = fneg nsz double %e
  %r = insertelement <2 x double> %y, double %n, i32 1
  ret <2 x double> %r
}

; Negative test - mismatched extract/insert index (1 vs 0) with a narrower
; source vector; the expected output is unchanged.

define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT:    [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
; CHECK-NEXT:    ret <4 x double> [[R]]
;
  %e = extractelement <2 x double> %x, i32 1
  %n = fneg nsz double %e
  %r = insertelement <4 x double> %y, double %n, i32 0
  ret <4 x double> %r
}

; Negative test - avoid changing poison ops
; The constant indices below (12, 6) are past the end of their vectors.

define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext12_v4f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 12
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <4 x float> %x, i32 12
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 12
  ret <4 x float> %r
}

define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext12_v2f32v4f32(
; CHECK-NEXT:    [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6
; CHECK-NEXT:    [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 6
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 12
  ret <4 x float> %r
}

; This used to crash because we assumed matching a true, unary fneg instruction.
; The negation here is spelled as the binary "fsub -0.0, x", and the result is
; inserted back into the same vector it was extracted from.

define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) {
; CHECK-LABEL: @ext1_v2f32_fsub(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]]
; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[X]], <2 x float> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 1
  %s = fsub float -0.0, %e
  %r = insertelement <2 x float> %x, float %s, i32 1
  ret <2 x float> %r
}

; This used to crash because we assumed matching a true, unary fneg instruction.

; The negation is written as "fsub nsz nnan 0.0, x" (positive zero). The
; expected output is a vector fneg carrying the same nnan/nsz flags.

define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
; CHECK-LABEL: @ext1_v2f32_fsub_fmf(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 1
  %s = fsub nsz nnan float 0.0, %e
  %r = insertelement <2 x float> %y, float %s, i32 1
  ret <2 x float> %r
}

; Same fsub form with a <2 x float> source widened into a <4 x float>
; destination.

define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %e = extractelement <2 x float> %x, i32 1
  %s = fsub nsz nnan float 0.0, %e
  %r = insertelement <4 x float> %y, float %s, i32 1
  ret <4 x float> %r
}