; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512

; Verify that we correctly generate 'addsub' instructions from
; a sequence of vector extracts + float add/sub + vector inserts.

; Full 4 x float pattern: subtract in even lanes, add in odd lanes.
define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}

; Partial pattern: only lanes 2 (sub) and 3 (add) are defined.
define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}

; As above, but the fadd operands are commuted (%4 + %3).
define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add = fadd float %4, %3
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
  ret <4 x float> %vecinsert2
}

define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
  ret <4 x float> %vecinsert2
}

define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
  ret <4 x float> %vecinsert2
}

define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}

; 256-bit double version: one ymm addsub on AVX, two xmm addsubs on SSE.
define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm2, %xmm0
; SSE-NEXT:    addsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x double> %A, i32 0
  %2 = extractelement <4 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <4 x double> %A, i32 2
  %4 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <4 x double> %A, i32 1
  %6 = extractelement <4 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <4 x double> %A, i32 3
  %8 = extractelement <4 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
  ret <4 x double> %vecinsert4
}

define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %A, i32 0
  %2 = extractelement <2 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <2 x double> %A, i32 1
  %4 = extractelement <2 x double> %B, i32 1
  %add = fadd double %3, %4
  %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
  ret <2 x double> %vecinsert2
}

define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm2, %xmm0
; SSE-NEXT:    addsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = extractelement <8 x float> %A, i32 0
  %2 = extractelement <8 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <8 x float> %A, i32 2
  %4 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <8 x float> %A, i32 1
  %6 = extractelement <8 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <8 x float> %A, i32 3
  %8 = extractelement <8 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <8 x float> %A, i32 4
  %10 = extractelement <8 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <8 x float> %A, i32 6
  %12 = extractelement <8 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <8 x float> %A, i32 5
  %14 = extractelement <8 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <8 x float> %A, i32 7
  %16 = extractelement <8 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
  ret <8 x float> %vecinsert8
}

; Verify that we don't generate addsub instruction for the following
; functions.

; A single defined lane is not enough to form an addsub.
define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  ret <4 x float> %vecinsert1
}

define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test11:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  ret <4 x float> %vecinsert1
}

define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  ret <4 x float> %vecinsert1
}

define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test13:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test13:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test13:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 3
  %2 = extractelement <4 x float> %B, i32 3
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
  ret <4 x float> %vecinsert1
}

; Two subtractions only (no add lanes), so no addsub can be formed.
define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
  ret <4 x float> %vecinsert2
}

; Two additions only (no sub lanes), so no addsub can be formed.
define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test15:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test15:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss %xmm2, %xmm1
; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}

; Constant operands in lanes 0 and 1 break the A op B pattern.
define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test16:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT:    subss %xmm5, %xmm4
; SSE-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm2 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    vsubss %xmm2, %xmm0, %xmm3
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm5, %xmm4, %xmm4
; AVX-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm2, %xmm5, %xmm2
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, 42.0
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, 42.0
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}

define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
; SSE-LABEL: test_v2f32:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v2 = extractelement <2 x float> %v0, i32 0
  %v3 = extractelement <2 x float> %v1, i32 0
  %v4 = extractelement <2 x float> %v0, i32 1
  %v5 = extractelement <2 x float> %v1, i32 1
  %sub = fsub float %v2, %v3
  %add = fadd float %v5, %v4
  %res0 = insertelement <2 x float> undef, float %sub, i32 0
  %res1 = insertelement <2 x float> %res0, float %add, i32 1
  ret <2 x float> %res1
}

define <16 x float> @test17(<16 x float> %A, <16 x float> %B) {
; SSE-LABEL: test17:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubps %xmm4, %xmm0
; SSE-NEXT:    addsubps %xmm5, %xmm1
; SSE-NEXT:    addsubps %xmm6, %xmm2
; SSE-NEXT:    addsubps %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test17:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddsubps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vaddsubps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test17:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubps %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; AVX512-NEXT:    vmovaps %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = extractelement <16 x float> %A, i32 0
  %2 = extractelement <16 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <16 x float> %A, i32 2
  %4 = extractelement <16 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <16 x float> %A, i32 1
  %6 = extractelement <16 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <16 x float> %A, i32 3
  %8 = extractelement <16 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <16 x float> %A, i32 4
  %10 = extractelement <16 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <16 x float> %A, i32 6
  %12 = extractelement <16 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <16 x float> %A, i32 5
  %14 = extractelement <16 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <16 x float> %A, i32 7
  %16 = extractelement <16 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %17 = extractelement <16 x float> %A, i32 8
  %18 = extractelement <16 x float> %B, i32 8
  %sub5 = fsub float %17, %18
  %19 = extractelement <16 x float> %A, i32 10
  %20 = extractelement <16 x float> %B, i32 10
  %sub6 = fsub float %19, %20
  %21 = extractelement <16 x float> %A, i32 9
  %22 = extractelement <16 x float> %B, i32 9
  %add5 = fadd float %21, %22
  %23 = extractelement <16 x float> %A, i32 11
  %24 = extractelement <16 x float> %B, i32 11
  %add6 = fadd float %23, %24
  %25 = extractelement <16 x float> %A, i32 12
  %26 = extractelement <16 x float> %B, i32 12
  %sub7 = fsub float %25, %26
  %27 = extractelement <16 x float> %A, i32 14
  %28 = extractelement <16 x float> %B, i32 14
  %sub8 = fsub float %27, %28
  %29 = extractelement <16 x float> %A, i32 13
  %30 = extractelement <16 x float> %B, i32 13
  %add7 = fadd float %29, %30
  %31 = extractelement <16 x float> %A, i32 15
  %32 = extractelement <16 x float> %B, i32 15
  %add8 = fadd float %31, %32
  %vecinsert1 = insertelement <16 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6
  %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9
  %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11
  %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8
  %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10
  %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13
  %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15
  %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12
  %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14
  ret <16 x float> %vecinsert16
}

define <8 x double> @test18(<8 x double> %A, <8 x double> %B) {
; SSE-LABEL: test18:
; SSE:       # %bb.0:
; SSE-NEXT:    addsubpd %xmm4, %xmm0
; SSE-NEXT:    addsubpd %xmm5, %xmm1
; SSE-NEXT:    addsubpd %xmm6, %xmm2
; SSE-NEXT:    addsubpd %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: test18:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddsubpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vaddsubpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test18:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vsubpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm2[1],zmm0[2],zmm2[3],zmm0[4],zmm2[5],zmm0[6],zmm2[7]
; AVX512-NEXT:    retq
  %1 = extractelement <8 x double> %A, i32 0
  %2 = extractelement <8 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <8 x double> %A, i32 2
  %4 = extractelement <8 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <8 x double> %A, i32 1
  %6 = extractelement <8 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <8 x double> %A, i32 3
  %8 = extractelement <8 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %9 = extractelement <8 x double> %A, i32 4
  %10 = extractelement <8 x double> %B, i32 4
  %sub3 = fsub double %9, %10
  %11 = extractelement <8 x double> %A, i32 6
  %12 = extractelement <8 x double> %B, i32 6
  %sub4 = fsub double %11, %12
  %13 = extractelement <8 x double> %A, i32 5
  %14 = extractelement <8 x double> %B, i32 5
  %add3 = fadd double %13, %14
  %15 = extractelement <8 x double> %A, i32 7
  %16 = extractelement <8 x double> %B, i32 7
  %add4 = fadd double %15, %16
  %vecinsert1 = insertelement <8 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2
  %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5
  %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7
  %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4
  %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6
  ret <8 x double> %vecinsert8
}