//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl %eax, %eax
        xorl %edx, %edx
        testb $32, %cl
        sete %al
        setne %dl
        sall %cl, %eax
        sall %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better.  It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
        movl %ecx, %eax
        shrl $5, %eax
        movl %eax, %edx
        xorl $1, %edx
        sall %cl, %eax
        sall %cl, %edx

64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register allocator.
Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG.  We still need to teach
the coalescer how to deal with it though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing.  Need to investigate.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The DAG combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
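
A hypothetical illustration (the function name is made up, not an existing test
case) of a source pattern that ends up with the load on the LHS of the compare:

/* The loaded value is the first operand of the compare, so a pattern written
   as (cmp reg, (load p)) does not match as-is.  Canonicalizing it to the
   equivalent "x > *p" (swap the operands, invert the condition) would put the
   load on the RHS where it can be folded into the cmp. */
int lhs_load_cmp(int *p, int x) {
  return *p < x;
}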

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

on some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important.  e.g., for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register.  It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate the best way to codegen sdiv X, (2^C).  For X/8, we currently
get this:

define i32 @test1(i32 %X) {
  %Y = sdiv i32 %X, 8
  ret i32 %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl 4(%esp), %eax
        cmpl $-1, %eax
        leal 7(%eax), %ecx
        cmovle %ecx, %eax
        sarl $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl.  We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and
Treesort.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        movl 4(%esp), %eax
        movaps (%eax), %xmm1
        comiss %xmm0, %xmm1
        setae %al
        movzbl %al, %ecx
        movl $3, %eax
        movl $4, %edx
        cmpl $0, %ecx
        cmove %edx, %eax
        ret

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.
There are a number of issues:
1) We are introducing a setcc between the result of the intrinsic call and the
   select.
2) The intrinsic is expected to produce an i32 value, so an any_extend (which
   becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate?  Is imull three or four
cycles?  Note: ICC generates this:
        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity.  The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority?  We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better.  It's two bytes
shorter than movl + leal.

On a Pentium M, both variants have the same characteristics with regard
to throughput; however, the multiplication has a latency of four cycles, as
opposed to two cycles for the movl+lea variant.

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
  br label %cond_true

cond_true:              ; preds = %cond_true, %entry
  %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]        ; <i32> [#uses=3]
  %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]  ; <i32> [#uses=1]
  %tmp2 = getelementptr i32* %a, i32 %x.0.0       ; <i32*> [#uses=1]
  %tmp3 = load i32* %tmp2                         ; <i32> [#uses=1]
  %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
  %tmp7 = add i32 %tmp5, %tmp3                    ; <i32> [#uses=2]
  %tmp9 = add i32 %x.0.0, 1                       ; <i32> [#uses=2]
  %tmp = icmp sgt i32 %tmp9, 39                   ; <i1> [#uses=1]
  br i1 %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
  ret i32 %tmp7
}
is pessimized by -loop-reduce and -indvars.

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of an argument of type double
on an 8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".

//===---------------------------------------------------------------------===//

Consider the expansion of:

define i32 @test3(i32 %X) {
  %tmp1 = urem i32 %X, 255
  ret i32 %tmp1
}

Currently it compiles to:

...
        movl $2155905153, %ecx
        movl 8(%esp), %esi
        movl %esi, %eax
        mull %ecx
...

This could be "reassociated" into:

        movl $2155905153, %eax
        movl 8(%esp), %ecx
        mull %ecx

to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses to mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary.  This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable.  Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary.  In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

  532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
  942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
  937  937 0x3d0a incl     %esi
    3    3 0x3d0b cmpb     %bl, %dl
   27   27 0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
  if ((p[0] == 1) & (p[1] == 2)) return 1;
  return 0;
}

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne LBB1_1
        addl $12, %esp
        ret
LBB1_1:
        call L_abort$stub

It would be better to produce:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne L_abort$stub
        addl $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        subl $12, %esp
        call L_abort$stub

Both are useful in different situations.  Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        pop %eax   # realign stack.
        call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction.  Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl $7, %eax
        movsbl 4(%esp), %ecx
        subl %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub.  In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl $7, %eax
        subl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl %esi
        subl $4, %esp
...
and an epilog like this:
        addl $4, %esp
        popl %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp.  Just don't pop
into any return registers :)

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches.  We generate really poor code for:

double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl $20, %esp
        pxor %xmm0, %xmm0
        movsd 24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp %al
        sete %cl
        testb %cl, %al
        jne LBB1_5  # UnifiedReturnBlock
LBB1_1: # cond_true

it would be better to replace the last four instructions with:

        jp LBB1_1
        je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd LCPI1_0(%rip), %xmm2
        cvtss2sd LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja LBB1_3  # cond_true
LBB1_2: # cond_true
        movapd %xmm3, %xmm2
LBB1_3: # cond_true
        movapd %xmm2, %xmm0
        ret

We should sink the load into xmm3 into the LBB1_2 block.  This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
  #include <algorithm>
  inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
  { return std::make_pair(a + b, a + b < a); }
  bool no_overflow(unsigned a, unsigned b)
  { return !full_add(a, b).second; }

Should compile to:
        addl %esi, %edi
        setae %al
        movzbl %al, %eax
        ret

on x86-64, instead of the rather stupid-looking:
        addl %esi, %edi
        setb %al
        xorb $1, %al
        movzbl %al, %eax
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
  %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
  %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
  %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
  %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
  %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
  %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
  %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
  %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
  %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
  %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
  %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
  br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl -68(%ebp), %eax
        movl $32, %ecx
        movl %ecx, -80(%ebp)
        subl %eax, -80(%ebp)
        movswl -52(%ebp), %eax
        movl %ecx, -84(%ebp)
        subl %eax, -84(%ebp)
        movswl -70(%ebp), %eax
        movl %ecx, -88(%ebp)
        subl %eax, -88(%ebp)
        movswl -50(%ebp), %eax
        subl %eax, %ecx
        movl %ecx, -76(%ebp)
        movswl -42(%ebp), %eax
        movl %eax, -92(%ebp)
        movswl -66(%ebp), %eax
        movl %eax, -96(%ebp)
        movw $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl.  The above instructions could be:
        movl $32, -80(%ebp)
...
        movl $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot.  However, %ecx
doesn't change, so we could simply subtract %eax from %ecx first and then use
%ecx (or vice-versa).

//===---------------------------------------------------------------------===//

This code:

  %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
  br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw %cx, %cx
        movswl %cx, %esi
        jns LBB4_109  # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal.
It suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

compare:
        subl $4, %esp
        cmpl $0, 8(%esp)
        setne %al
        movzbw %al, %ax
        cmpl $1, 12(%esp)
        setg %cl
        movzbw %cl, %cx
        cmove %ax, %cx
        testb $1, %cl
        jne .LBB1_2  # UnifiedReturnBlock
.LBB1_1:  # ifthen
        call abort
.LBB1_2:  # UnifiedReturnBlock
        addl $4, %esp
        ret

(also really horrible code on ppc).  This is due to the expand code for 64-bit
compares.  GCC produces multiple branches, which is much nicer:

compare:
        subl $12, %esp
        movl 20(%esp), %edx
        movl 16(%esp), %eax
        decl %edx
        jle .L7
.L5:
        addl $12, %esp
        ret
        .p2align 4,,7
.L7:
        jl .L4
        cmpl $0, %eax
        .p2align 4,,8
        ja .L5
.L4:
        .p2align 4,,9
        call abort

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from
the caller's arguments).
This is done to prevent overwriting of parameters (see example
below) that might be used later.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
  int64 local = arg2 * 2;
  return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]      ->  [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg1, arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg2, arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.

//===---------------------------------------------------------------------===//

main ()
{
  int i = 0;
  unsigned long int z = 0;

  do {
    z -= 0x00004000;
    i++;
    if (i > 0x00040000)
      abort ();
  } while (z > 0);
  exit (0);
}

gcc compiles this to:

_main:
        subl $28, %esp
        xorl %eax, %eax
        jmp L2
L3:
        cmpl $262144, %eax
        je L10
L2:
        addl $1, %eax
        cmpl $262145, %eax
        jne L3
        call L_abort$stub
L10:
        movl $0, (%esp)
        call L_exit$stub

llvm:

_main:
        subl $12, %esp
        movl $1, %eax
        movl $16384, %ecx
LBB1_1: # bb
        cmpl $262145, %eax
        jge LBB1_4  # cond_true
LBB1_2: # cond_next
        incl %eax
        addl $4294950912, %ecx
        cmpl $16384, %ecx
        jne LBB1_1  # bb
LBB1_3: # bb11
        xorl %eax, %eax
        addl $12, %esp
        ret
LBB1_4: # cond_true
        call L_abort$stub

1. LSR should rewrite the first cmp with induction variable %ecx.
2. DAG combiner should fold
        leal 1(%eax), %edx
        cmpl $262145, %edx
   =>
        cmpl $262144, %eax

//===---------------------------------------------------------------------===//

define i64 @test(double %X) {
  %Y = fptosi double %X to i64
  ret i64 %Y
}

compiles to:

_test:
        subl $20, %esp
        movsd 24(%esp), %xmm0
        movsd %xmm0, 8(%esp)
        fldl 8(%esp)
        fisttpll (%esp)
        movl 4(%esp), %edx
        movl (%esp), %eax
        addl $20, %esp
        #FP_REG_KILL
        ret

This should just fldl directly from the input stack slot.

//===---------------------------------------------------------------------===//

This code:
int foo (int x) { return (x & 65535) | 255; }

Should compile into:

_foo:
        movzwl 4(%esp), %eax
        orl $255, %eax
        ret

instead of:
_foo:
        movl $65280, %eax
        andl 4(%esp), %eax
        orl $255, %eax
        ret

//===---------------------------------------------------------------------===//

We're codegen'ing multiply of long longs inefficiently:

unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
  return arg1 * arg2;
}

We compile to (fomit-frame-pointer):

_LLM:
        pushl %esi
        movl 8(%esp), %ecx
        movl 16(%esp), %esi
        movl %esi, %eax
        mull %ecx
        imull 12(%esp), %esi
        addl %edx, %esi
        imull 20(%esp), %ecx
        movl %esi, %edx
        addl %ecx, %edx
        popl %esi
        ret

This looks like a scheduling deficiency and lack of remat of the load from
the argument area.  ICC apparently produces:

        movl 8(%esp), %ecx
        imull 12(%esp), %ecx
        movl 16(%esp), %eax
        imull 4(%esp), %eax
        addl %eax, %ecx
        movl 4(%esp), %eax
        mull 12(%esp)
        addl %ecx, %edx
        ret

Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236

//===---------------------------------------------------------------------===//

We can fold a store into "zeroing a reg".  Instead of:

xorl %eax, %eax
movl %eax, 124(%esp)

we should get:

movl $0, 124(%esp)

if the flags of the xor are dead.

Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
be folded into: shl [mem], 1

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus an
xor or and instruction, for example:

        xorpd LCPI1_0, %xmm2

However, if xmm2 gets spilled, we end up with really ugly code like this:

        movsd (%esp), %xmm0
        xorpd LCPI1_0, %xmm0
        movsd %xmm0, (%esp)

Since we 'know' that this is a 'neg', we can actually "fold" the spill into
the neg/abs instruction, turning it into an *integer* operation, like this:

        xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)

you could also use xorb, but xorl is less likely to lead to a partial register
stall.  Here is a contrived testcase:

double a, b, c;
void test(double *P) {
  double X = *P;
  a = X;
  bar();
  X = -X;
  b = X;
  bar();
  c = X;
}

//===---------------------------------------------------------------------===//

The code generated on x86 for checking for signed overflow on a multiply done
the obvious way is much longer than it needs to be.

int x(int a, int b) {
  long long prod = (long long)a*b;
  return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}

See PR2053 for more details.

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter.  The downside is that it puts more
pressure on register allocation because it has fixed operands.

Example:
int abs(int x) {return x < 0 ? -x : x;}

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
        movl 4(%esp), %eax
        cltd
        xorl %edx, %eax
        subl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}

The following code is currently generated:
FirstOnet:
        movl 8(%esp), %eax
        cmpl $65536, %eax
        movl 4(%esp), %ecx
        jb .LBB1_2  # UnifiedReturnBlock
.LBB1_1:  # ifthen
        shrl $16, %eax
        movzbl first_one(%eax), %eax
        ret
.LBB1_2:  # UnifiedReturnBlock
        xorl %eax, %eax
        ret

We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
lets us change the cmpl into a testl, which is shorter, and eliminate the shift.

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
entry:
  %tmp2 = icmp eq i8 %d, 0                        ; <i1> [#uses=1]
  br i1 %tmp2, label %bb7, label %bb

bb:             ; preds = %entry
  %tmp6 = add i32 %b, %a                          ; <i32> [#uses=1]
  ret i32 %tmp6

bb7:            ; preds = %entry
  %tmp10 = sub i32 %a, %c                         ; <i32> [#uses=1]
  ret i32 %tmp10
}

to:

foo:                                    # @foo
# %bb.0:                                # %entry
        movl 4(%esp), %ecx
        cmpb $0, 16(%esp)
        je .LBB0_2
# %bb.1:                                # %bb
        movl 8(%esp), %eax
        addl %ecx, %eax
        ret
.LBB0_2:                                # %bb7
        movl 12(%esp), %edx
        movl %ecx, %eax
        subl %edx, %eax
        ret

There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
couple more movls by putting 4(%esp) into %eax instead of %ecx.

//===---------------------------------------------------------------------===//

Take the following:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
entry:
  %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
  br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:           ; preds = %entry
  tail call void @exit( i32 1 ) noreturn nounwind
  unreachable
bb4.i:          ; preds = %entry
  store i1 true, i1* @in_exit.4870.b
  tail call void @exit( i32 1 ) noreturn nounwind
  unreachable
}
declare void @exit(i32) noreturn nounwind

This compiles into:
_abort_gzip:                            ## @abort_gzip
## %bb.0:                               ## %entry
        subl $12, %esp
        movb _in_exit.4870.b, %al
        cmpb $1, %al
        jne LBB0_2

We somehow miss folding the movb into the cmpb.

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
  return x-y-1;
}

into (-m64):

_test:
        decl %edi
        movl %edi, %eax
        subl %esi, %eax
        ret

it would be better to codegen as: x+~y  (notl+addl)

//===---------------------------------------------------------------------===//

This code:

int foo(const char *str,...)
{
  __builtin_va_list a; int x;
  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
  return x;
}

gets compiled into this on x86-64:
        subq    $200, %rsp
        movaps  %xmm7, 160(%rsp)
        movaps  %xmm6, 144(%rsp)
        movaps  %xmm5, 128(%rsp)
        movaps  %xmm4, 112(%rsp)
        movaps  %xmm3, 96(%rsp)
        movaps  %xmm2, 80(%rsp)
        movaps  %xmm1, 64(%rsp)
        movaps  %xmm0, 48(%rsp)
        movq    %r9, 40(%rsp)
        movq    %r8, 32(%rsp)
        movq    %rcx, 24(%rsp)
        movq    %rdx, 16(%rsp)
        movq    %rsi, 8(%rsp)
        leaq    (%rsp), %rax
        movq    %rax, 192(%rsp)
        leaq    208(%rsp), %rax
        movq    %rax, 184(%rsp)
        movl    $48, 180(%rsp)
        movl    $8, 176(%rsp)
        movl    176(%rsp), %eax
        cmpl    $47, %eax
        jbe     .LBB1_3  # bb
.LBB1_1:        # bb3
        movq    184(%rsp), %rcx
        leaq    8(%rcx), %rax
        movq    %rax, 184(%rsp)
.LBB1_2:        # bb4
        movl    (%rcx), %eax
        addq    $200, %rsp
        ret
.LBB1_3:        # bb
        movl    %eax, %ecx
        addl    $8, %eax
        addq    192(%rsp), %rcx
        movl    %eax, 176(%rsp)
        jmp     .LBB1_2  # bb4

gcc 4.3 generates:
        subq    $96, %rsp
.LCFI0:
        leaq    104(%rsp), %rax
        movq    %rsi, -80(%rsp)
        movl    $8, -120(%rsp)
        movq    %rax, -112(%rsp)
        leaq    -88(%rsp), %rax
        movq    %rax, -104(%rsp)
        movl    $8, %eax
        cmpl    $48, %eax
        jb      .L6
        movq    -112(%rsp), %rdx
        movl    (%rdx), %eax
        addq    $96, %rsp
        ret
        .p2align 4,,10
        .p2align 3
.L6:
        mov     %eax, %edx
        addq    -104(%rsp), %rdx
        addl    $8, %eax
        movl    %eax, -120(%rsp)
        movl    (%rdx), %eax
        addq    $96, %rsp
        ret

and it gets compiled into this on x86:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp
        leal    12(%ebp), %eax
        movl    %eax, -4(%ebp)
        leal    16(%ebp), %eax
        movl    %eax, -4(%ebp)
        movl    12(%ebp), %eax
        addl    $4, %esp
        popl    %ebp
        ret

gcc 4.3 generates:
        pushl   %ebp
        movl    %esp, %ebp
        movl    12(%ebp), %eax
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Teach tblgen not to check bitconvert source type in some cases.  This allows us
to consolidate the following patterns in X86InstrMMX.td:

def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                 (iPTR 0))))),
          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;

There are other cases in various td files.

//===---------------------------------------------------------------------===//

Take something like the following on x86-32:
unsigned a(unsigned long long x, unsigned y) {return x % y;}

We currently generate a libcall, but we really shouldn't: the expansion is
shorter and likely faster than the libcall.  The expected code is something
like the following:

        movl 12(%ebp), %eax
        movl 16(%ebp), %ecx
        xorl %edx, %edx
        divl %ecx
        movl 8(%ebp), %eax
        divl %ecx
        movl %edx, %eax
        ret

A similar code sequence works for division.

//===---------------------------------------------------------------------===//

We currently compile this:

define i32 @func1(i32 %v1, i32 %v2) nounwind {
entry:
  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
  %sum = extractvalue {i32, i1} %t, 0
  %obit = extractvalue {i32, i1} %t, 1
  br i1 %obit, label %overflow, label %normal
normal:
  ret i32 %sum
overflow:
  call void @llvm.trap()
  unreachable
}
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare void @llvm.trap()

to:

_func1:
        movl 4(%esp), %eax
        addl 8(%esp), %eax
        jo LBB1_2  ## overflow
LBB1_1:  ## normal
        ret
LBB1_2:  ## overflow
        ud2

it would be nice to produce "into" someday.

//===---------------------------------------------------------------------===//

Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions.  This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others.  It is also currently not done
for read-modify-write instructions.  It is also currently not done if the
OF or CF flags are needed.

The shift operators have the complication that when the shift count is
zero, EFLAGS is not set, so they can only subsume a test instruction if
the shift count is known to be non-zero.  Also, using the EFLAGS value
from a shift is apparently very slow on some x86 implementations.

In read-modify-write instructions, the root node in the isel match is
the store, and isel has no way for the use of the EFLAGS result of the
arithmetic to be remapped to the new node.

Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF.  In order to
replace a test with an add or subtract in a situation where OF or CF is
needed, codegen must be able to prove that the operation cannot see
signed or unsigned overflow, respectively.

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible.
A silly example is:

define <16 x float> @foo(<16 x float> %A) nounwind {
  %tmp = alloca <16 x float>, align 16
  %tmp2 = alloca <16 x float>, align 16
  store <16 x float> %A, <16 x float>* %tmp
  %s = bitcast <16 x float>* %tmp to i8*
  %s2 = bitcast <16 x float>* %tmp2 to i8*
  call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
  %R = load <16 x float>* %tmp2
  ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
        subl $140, %esp
        movaps %xmm3, 112(%esp)
        movaps %xmm2, 96(%esp)
        movaps %xmm1, 80(%esp)
        movaps %xmm0, 64(%esp)
        movl 60(%esp), %eax
        movl %eax, 124(%esp)
        movl 56(%esp), %eax
        movl %eax, 120(%esp)
        movl 52(%esp), %eax
        <many many more 32-bit copies>
        movaps (%esp), %xmm0
        movaps 16(%esp), %xmm1
        movaps 32(%esp), %xmm2
        movaps 48(%esp), %xmm3
        addl $140, %esp
        ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.

//===---------------------------------------------------------------------===//

Implement processor-specific optimizations for parity with GCC on these
processors.  GCC does two optimizations:

1. ix86_pad_returns inserts a noop before ret instructions if immediately
   preceded by a conditional branch or is the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a&0xf0)>>4; }

Current output:
        movl 4(%esp), %eax
        shrl $4, %eax
        andl $15, %eax
        ret

Ideal output:
        movzbl 4(%esp), %eax
        shrl $4, %eax
        ret

//===---------------------------------------------------------------------===//

Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch()
properly.

When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to use xadd to implement these.  Instead, it can use
add, sub, inc, or dec instructions with the "lock" prefix.

This is currently implemented using a bit of an instruction selection trick.
The issue is that the target independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain.  The current
trick is to select it into a MERGE_VALUES with the first definition being an
implicit_def.  The proper solution is to add new ISD opcodes for the no-output
variant.  The DAG combiner can then transform the node before it gets to target
node selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions.  We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions.  The asm printer (or JIT) can
use this information to add the "lock" prefix.
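
As a minimal C sketch of the unused-return-value case described above (the
function names are illustrative, not an existing test case):

/* The results of the builtins are ignored, so only the memory update matters.
   These could lower to "lock incl"/"lock decl" (or "lock addl"/"lock subl")
   with no result register, instead of a lock xadd that produces a value. */
void count_up(int *counter) {
  __sync_add_and_fetch(counter, 1);
}
void count_down(int *counter) {
  __sync_sub_and_fetch(counter, 1);
}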

//===---------------------------------------------------------------------===//

struct B {
  unsigned char y0 : 1;
};

int bar(struct B* a) { return a->y0; }

define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
  %2 = load i8* %1, align 1
  %3 = and i8 %2, 1
  %4 = zext i8 %3 to i32
  ret i32 %4
}

bar:                                    # @bar
# %bb.0:
        movb (%rdi), %al
        andb $1, %al
        movzbl %al, %eax
        ret

Missed optimization: should be movl+andl.

//===---------------------------------------------------------------------===//

The x86_64 ABI says:

Booleans, when stored in a memory object, are stored as single byte objects the
value of which is always 0 (false) or 1 (true).

We are not using this fact:

int bar(_Bool *a) { return *a; }

define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
  %1 = load i8* %a, align 1, !tbaa !0
  %tmp = and i8 %1, 1
  %2 = zext i8 %tmp to i32
  ret i32 %2
}

bar:
        movb (%rdi), %al
        andb $1, %al
        movzbl %al, %eax
        ret

GCC produces:

bar:
        movzbl (%rdi), %eax
        ret

//===---------------------------------------------------------------------===//

Take the following C code:
int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }

We generate the following IR with clang:
define i32 @f(i32 %a, i32 %b) nounwind readnone {
entry:
  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
  ret i32 %conv5
}

And the following x86 code:
        xorl %esi, %edi
        testb $-1, %dil
        sete %al
        movzbl %al, %eax
        ret

A cmpb instead of the xorl+testb would be one instruction shorter.

//===---------------------------------------------------------------------===//

Given the following C code:
int f(int a, int b) { return (signed char)a == (signed char)b; }

We generate the following IR with clang:
define i32 @f(i32 %a, i32 %b) nounwind readnone {
entry:
  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
  ret i32 %conv5
}

And the following x86 code:
        movsbl %sil, %eax
        movsbl %dil, %ecx
        cmpl %eax, %ecx
        sete %al
        movzbl %al, %eax
        ret

It should be possible to eliminate the sign extensions.

//===---------------------------------------------------------------------===//

LLVM misses a load+store narrowing opportunity in this code:

%struct.bf = type { i64, i16, i16, i32 }

@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]

define void @t1() nounwind ssp {
entry:
  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
  store i32 %4, i32* %2, align 1
  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
  store i32 %9, i32* %7, align 1
  ret void
}

LLVM currently emits this:

        movq bfi(%rip), %rax
        andl $-65537, 8(%rax)
        movq bfi(%rip), %rax
        andl $-131073, 8(%rax)
        ret

It could narrow the loads and stores to emit this:

        movq bfi(%rip), %rax
        andb $-2, 10(%rax)
        movq bfi(%rip), %rax
        andb $-3, 10(%rax)
        ret

The trouble is that there is a TokenFactor between the store and the
load, making it non-trivial to determine if there's anything between
the load and the store which would prohibit narrowing.

//===---------------------------------------------------------------------===//

This code:
void foo(unsigned x) {
  if (x == 0) bar();
  else if (x == 1) qux();
}

currently compiles into:
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        je LBB0_3
        testl %eax, %eax
        jne LBB0_4

the testl could be removed:
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        je LBB0_3
        jb LBB0_4

0 is the only unsigned number < 1.

//===---------------------------------------------------------------------===//

This code:

%0 = type { i32, i1 }

define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
entry:
  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
  %cmp = extractvalue %0 %uadd, 1
  %inc = zext i1 %cmp to i32
  %add = add i32 %x, %sum
  %z.0 = add i32 %add, %inc
  ret i32 %z.0
}

declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

compiles to:

_add32carry:                            ## @add32carry
        addl %esi, %edi
        sbbl %ecx, %ecx
        movl %edi, %eax
        subl %ecx, %eax
        ret

But it could be:

_add32carry:
        leal (%rsi,%rdi), %eax
        cmpl %esi, %eax
        adcl $0, %eax
        ret

//===---------------------------------------------------------------------===//

The hot loop of 256.bzip2 contains code that looks a bit like this:

int foo(char *P, char *Q, int x, int y) {
  if (P[0] != Q[0])
    return P[0] < Q[0];
  if (P[1] != Q[1])
    return P[1] < Q[1];
  if (P[2] != Q[2])
    return P[2] < Q[2];
  return P[3] < Q[3];
}

In the real code, we get a lot more wrong than this.
However, even in this code we generate:

_foo:                                   ## @foo
## %bb.0:                               ## %entry
        movb (%rsi), %al
        movb (%rdi), %cl
        cmpb %al, %cl
        je LBB0_2
LBB0_1:                                 ## %if.then
        cmpb %al, %cl
        jmp LBB0_5
LBB0_2:                                 ## %if.end
        movb 1(%rsi), %al
        movb 1(%rdi), %cl
        cmpb %al, %cl
        jne LBB0_1
## %bb.3:                               ## %if.end38
        movb 2(%rsi), %al
        movb 2(%rdi), %cl
        cmpb %al, %cl
        jne LBB0_1
## %bb.4:                               ## %if.end60
        movb 3(%rdi), %al
        cmpb 3(%rsi), %al
LBB0_5:                                 ## %if.end60
        setl %al
        movzbl %al, %eax
        ret

Note that we generate jumps to LBB0_1 which does a redundant compare.  The
redundant compare also forces the register values to be live, which prevents
folding one of the loads into the compare.  In contrast, GCC 4.2 produces:

_foo:
        movzbl (%rsi), %eax
        cmpb %al, (%rdi)
        jne L10
L12:
        movzbl 1(%rsi), %eax
        cmpb %al, 1(%rdi)
        jne L10
        movzbl 2(%rsi), %eax
        cmpb %al, 2(%rdi)
        jne L10
        movzbl 3(%rdi), %eax
        cmpb 3(%rsi), %al
L10:
        setl %al
        movzbl %al, %eax
        ret

which is "perfect".

//===---------------------------------------------------------------------===//

For the branch in the following code:
int a();
int b(int x, int y) {
  if (x & (1<<(y&7)))
    return a();
  return y;
}

We currently generate:
        movb %sil, %al
        andb $7, %al
        movzbl %al, %eax
        btl %eax, %edi
        jae .LBB0_2

movl+andl would be shorter than the movb+andb+movzbl sequence.

//===---------------------------------------------------------------------===//

For the following:
struct u1 {
  float x, y;
};
float foo(struct u1 u) {
  return u.x + u.y;
}

We currently generate:
        movdqa %xmm0, %xmm1
        pshufd $1, %xmm0, %xmm0         # xmm0 = xmm0[1,0,0,0]
        addss %xmm1, %xmm0
        ret

We could save an instruction here by commuting the addss.

//===---------------------------------------------------------------------===//

This (from PR9661):

float clamp_float(float a) {
  if (a > 1.0f)
    return 1.0f;
  else if (a < 0.0f)
    return 0.0f;
  else
    return a;
}

Could compile to:

clamp_float:                            # @clamp_float
        movss .LCPI0_0(%rip), %xmm1
        minss %xmm1, %xmm0
        pxor %xmm1, %xmm1
        maxss %xmm1, %xmm0
        ret

with -ffast-math.

//===---------------------------------------------------------------------===//

This function (from PR9803):

int clamp2(int a) {
  if (a > 5)
    a = 5;
  if (a < 0)
    return 0;
  return a;
}

Compiles to:

_clamp2:                                ## @clamp2
        pushq %rbp
        movq %rsp, %rbp
        cmpl $5, %edi
        movl $5, %ecx
        cmovlel %edi, %ecx
        testl %ecx, %ecx
        movl $0, %eax
        cmovnsl %ecx, %eax
        popq %rbp
        ret

The move of 0 could be scheduled above the test to make it a xor reg,reg.

//===---------------------------------------------------------------------===//

GCC PR48986.  We currently compile this:

void bar(void);
void yyy(int* p) {
  if (__sync_fetch_and_add(p, -1) == 1)
    bar();
}

into:
        movl $-1, %eax
        lock
        xaddl %eax, (%rdi)
        cmpl $1, %eax
        je LBB0_2

Instead we could generate:

        lock
        decl (%rdi)
        je LBB0_2

The trick is to match "fetch_and_add(X, -C) == C".

//===---------------------------------------------------------------------===//

unsigned t(unsigned a, unsigned b) {
  return a <= b ? 5 : -5;
}

We generate:
        movl $5, %ecx
        cmpl %esi, %edi
        movl $-5, %eax
        cmovbel %ecx, %eax

GCC:
        cmpl %edi, %esi
        sbbl %eax, %eax
        andl $-10, %eax
        addl $5, %eax

//===---------------------------------------------------------------------===//