; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX

; fold (sub x, 0) -> x
define <4 x i32> @combine_vec_sub_zero(<4 x i32> %a) {
; CHECK-LABEL: combine_vec_sub_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sub <4 x i32> %a, zeroinitializer
  ret <4 x i32> %1
}

; fold (sub x, x) -> 0
define <4 x i32> @combine_vec_sub_self(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_self:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_self:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %a
  ret <4 x i32> %1
}

; fold (sub x, c) -> (add x, -c)
define <4 x i32> @combine_vec_sub_constant(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sub_constant:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_constant:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %1
}

; Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
define <4 x i32> @combine_vec_sub_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sub_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
  ret <4 x i32> %1
}

; fold A-(A-B) -> B
define <4 x i32> @combine_vec_sub_sub(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_sub_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> %a, %1
  ret <4 x i32> %2
}

; fold (A+B)-A -> B
define <4 x i32> @combine_vec_sub_add0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_sub_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, %b
  %2 = sub <4 x i32> %1, %a
  ret <4 x i32> %2
}

; fold (A+B)-B -> A
define <4 x i32> @combine_vec_sub_add1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: combine_vec_sub_add1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = add <4 x i32> %a, %b
  %2 = sub <4 x i32> %1, %b
  ret <4 x i32> %2
}

; fold C2-(A+C1) -> (C2-C1)-A
define <4 x i32> @combine_vec_sub_constant_add(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_constant_add:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd {{.*#+}} xmm1 = [3,1,4294967295,4294967293]
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_constant_add:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [3,1,4294967295,4294967293]
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, <i32 0, i32 1, i32 2, i32 3>
  %2 = sub <4 x i32> <i32 3, i32 2, i32 1, i32 0>, %1
  ret <4 x i32> %2
}
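; Note: the [3,1,4294967295,4294967293] constant above is (C2-C1) folded at
; compile time: <3,2,1,0> - <0,1,2,3> = <3,1,-1,-3>, printed in unsigned form.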

; fold ((A+(B+C))-B) -> A+C
define <4 x i32> @combine_vec_sub_add_add(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_sub_add_add:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_add_add:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %b, %c
  %2 = add <4 x i32> %a, %1
  %3 = sub <4 x i32> %2, %b
  ret <4 x i32> %3
}

; fold ((A+(B-C))-B) -> A-C
define <4 x i32> @combine_vec_sub_add_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_sub_add_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_add_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %c
  %2 = add <4 x i32> %a, %1
  %3 = sub <4 x i32> %2, %b
  ret <4 x i32> %3
}

; fold ((A-(B-C))-C) -> A-B
define <4 x i32> @combine_vec_sub_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_sub_sub_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_sub_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %c
  %2 = sub <4 x i32> %a, %1
  %3 = sub <4 x i32> %2, %c
  ret <4 x i32> %3
}

; fold undef-A -> undef
define <4 x i32> @combine_vec_sub_undef0(<4 x i32> %a) {
; CHECK-LABEL: combine_vec_sub_undef0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sub <4 x i32> undef, %a
  ret <4 x i32> %1
}

; fold A-undef -> undef
define <4 x i32> @combine_vec_sub_undef1(<4 x i32> %a) {
; CHECK-LABEL: combine_vec_sub_undef1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sub <4 x i32> %a, undef
  ret <4 x i32> %1
}

; sub X, (sext Y i1) -> add X, (and Y 1)
define <4 x i32> @combine_vec_add_sext(<4 x i32> %x, <4 x i1> %y) {
; SSE-LABEL: combine_vec_add_sext:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i1> %y to <4 x i32>
  %2 = sub <4 x i32> %x, %1
  ret <4 x i32> %2
}

; sub X, (sextinreg Y i1) -> add X, (and Y 1)
define <4 x i32> @combine_vec_sub_sextinreg(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_sub_sextinreg:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_sextinreg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %y, <i32 31, i32 31, i32 31, i32 31>
  %2 = ashr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %3 = sub <4 x i32> %x, %2
  ret <4 x i32> %3
}

; sub C2, (xor X, C1) -> add (xor X, ~C1), C2+1
define i32 @combine_sub_xor_consts(i32 %x) {
; CHECK-LABEL: combine_sub_xor_consts:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    xorl $-32, %edi
; CHECK-NEXT:    leal 33(%rdi), %eax
; CHECK-NEXT:    retq
  %xor = xor i32 %x, 31
  %sub = sub i32 32, %xor
  ret i32 %sub
}
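; Sanity check on the scalar fold above: 32 - (x ^ 31) == (x ^ -32) + 33 for all
; x, since x ^ -32 == ~(x ^ 31) == -(x ^ 31) - 1; the lea supplies the +33.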

define <4 x i32> @combine_vec_sub_xor_consts(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sub_xor_consts:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sub_xor_consts:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %xor = xor <4 x i32> %x, <i32 28, i32 29, i32 -1, i32 -31>
  %sub = sub <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %xor
  ret <4 x i32> %sub
}

define <4 x i32> @combine_vec_neg_xor_consts(<4 x i32> %x) {
; SSE-LABEL: combine_vec_neg_xor_consts:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_neg_xor_consts:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %xor = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  %sub = sub <4 x i32> zeroinitializer, %xor
  ret <4 x i32> %sub
}

; With AVX, this could use broadcast (an extra load) and
; load-folded 'add', but currently we favor the virtually
; free pcmpeq instruction.

define void @PR52032_oneuse_constant(ptr %p) {
; SSE-LABEL: PR52032_oneuse_constant:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR52032_oneuse_constant:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %i3 = load <8 x i32>, ptr %p, align 4
  %i4 = add nsw <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %i4, ptr %p, align 4
  ret void
}

; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
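; In the AVX checks below the splat-of-1 stays as a broadcast so both loads fold
; into vpaddd; in the SSE checks the all-ones from pcmpeqd is reused and each
; vector is updated with psubd instead.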

define void @PR52032(ptr %p) {
; SSE-LABEL: PR52032:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    movdqu 16(%rdi), %xmm2
; SSE-NEXT:    movdqu 32(%rdi), %xmm3
; SSE-NEXT:    movdqu 48(%rdi), %xmm4
; SSE-NEXT:    psubd %xmm0, %xmm2
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE-NEXT:    psubd %xmm0, %xmm4
; SSE-NEXT:    psubd %xmm0, %xmm3
; SSE-NEXT:    movdqu %xmm3, 32(%rdi)
; SSE-NEXT:    movdqu %xmm4, 48(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR52032:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %i3 = load <8 x i32>, ptr %p, align 4
  %i4 = add nsw <8 x i32> %i3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %i4, ptr %p, align 4
  %p2 = getelementptr inbounds <8 x i32>, ptr %p, i64 1
  %i8 = load <8 x i32>, ptr %p2, align 4
  %i9 = add nsw <8 x i32> %i8, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %i9, ptr %p2, align 4
  ret void
}

; Same as above, but 128-bit ops:
; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
; With SSE, we do it because we can't load fold the other op without overwriting the constant op.

define void @PR52032_2(ptr %p) {
; SSE-LABEL: PR52032_2:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    movdqu 16(%rdi), %xmm2
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    psubd %xmm0, %xmm2
; SSE-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR52032_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
; AVX-NEXT:    retq
  %i3 = load <4 x i32>, ptr %p, align 4
  %i4 = add nsw <4 x i32> %i3, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %i4, ptr %p, align 4
  %p2 = getelementptr inbounds <4 x i32>, ptr %p, i64 1
  %i8 = load <4 x i32>, ptr %p2, align 4
  %i9 = add nsw <4 x i32> %i8, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %i9, ptr %p2, align 4
  ret void
}

; If we are starting with a 'sub', it is always better to do the transform.
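; Here sub x, 1 becomes add x, -1, so the all-ones value from pcmpeqd serves as
; the constant on both sides and AVX can still fold the loads into vpaddd.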

define void @PR52032_3(ptr %p) {
; SSE-LABEL: PR52032_3:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    movdqu 16(%rdi), %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR52032_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
; AVX-NEXT:    retq
  %i3 = load <4 x i32>, ptr %p, align 4
  %i4 = sub nsw <4 x i32> %i3, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %i4, ptr %p, align 4
  %p2 = getelementptr inbounds <4 x i32>, ptr %p, i64 1
  %i8 = load <4 x i32>, ptr %p2, align 4
  %i9 = sub nsw <4 x i32> %i8, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %i9, ptr %p2, align 4
  ret void
}

; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.

define void @PR52032_4(ptr %p, ptr %q) {
; SSE-LABEL: PR52032_4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rsi)
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    movdqu 16(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR52032_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    vmovdqu 16(%rdi), %xmm0
; AVX-NEXT:    vmovdqa %xmm0, 16(%rsi)
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
; AVX-NEXT:    retq
  %i3 = load <4 x i32>, ptr %p, align 4
  store <4 x i32> %i3, ptr %q
  %i4 = add nsw <4 x i32> %i3, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %i4, ptr %p, align 4
  %p2 = getelementptr inbounds <4 x i32>, ptr %p, i64 1
  %q2 = getelementptr inbounds <4 x i32>, ptr %q, i64 1
  %i8 = load <4 x i32>, ptr %p2, align 4
  store <4 x i32> %i8, ptr %q2
  %i9 = add nsw <4 x i32> %i8, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %i9, ptr %p2, align 4
  ret void
}

; Fold sub(32,xor(bsr(x),31)) -> add(xor(bsr(x),-32),33) -> add(or(bsr(x),-32),33) -> add(bsr(x),1)
define i32 @PR74101(i32 %a0) {
; CHECK-LABEL: PR74101:
; CHECK:       # %bb.0:
; CHECK-NEXT:    bsrl %edi, %eax
; CHECK-NEXT:    incl %eax
; CHECK-NEXT:    retq
  %lz = call i32 @llvm.ctlz.i32(i32 %a0, i1 true)
  %add = sub nuw nsw i32 32, %lz
  ret i32 %add
}
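; The xor->or step above is valid because bsr(x) lies in [0,31] (ctlz with the
; zero-is-poison flag set), so none of its bits overlap -32 and
; (bsr(x) | -32) + 33 == bsr(x) + 1.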