Directory listing of the LLVM X86 backend (llvm/lib/Target/X86/):

Subdirectories:
  AsmParser/, Disassembler/, GISel/, MCA/, MCTargetDesc/, TargetInfo/

Files:
  CMakeLists.txt, ImmutableGraph.h, README-FPStack.txt, README-SSE.txt,
  README-X86-64.txt, README.txt, X86.h, X86.td, X86ArgumentStackSlotRebase.cpp,
  X86AsmPrinter.cpp, X86AsmPrinter.h, X86AvoidStoreForwardingBlocks.cpp,
  X86AvoidTrailingCall.cpp, X86CallFrameOptimization.cpp, X86CallingConv.cpp,
  X86CallingConv.h, X86CallingConv.td, X86CmovConversion.cpp,
  X86CodeGenPassBuilder.cpp, X86CompressEVEX.cpp, X86DiscriminateMemOps.cpp,
  X86DomainReassignment.cpp, X86DynAllocaExpander.cpp, X86ExpandPseudo.cpp,
  X86FastISel.cpp, X86FastPreTileConfig.cpp, X86FastTileConfig.cpp,
  X86FixupBWInsts.cpp, X86FixupInstTuning.cpp, X86FixupLEAs.cpp,
  X86FixupSetCC.cpp, X86FixupVectorConstants.cpp, X86FlagsCopyLowering.cpp,
  X86FloatingPoint.cpp, X86FrameLowering.cpp, X86FrameLowering.h,
  X86GenRegisterBankInfo.def, X86ISelDAGToDAG.cpp, X86ISelDAGToDAG.h,
  X86ISelLowering.cpp, X86ISelLowering.h, X86ISelLoweringCall.cpp,
  X86IndirectBranchTracking.cpp, X86IndirectThunks.cpp, X86InsertPrefetch.cpp,
  X86InsertWait.cpp, X86InstCombineIntrinsic.cpp, X86Instr3DNow.td,
  X86InstrAMX.td, X86InstrAVX10.td, X86InstrAVX512.td, X86InstrArithmetic.td,
  X86InstrAsmAlias.td, X86InstrBuilder.h, X86InstrCMovSetCC.td,
  X86InstrCompiler.td, X86InstrConditionalCompare.td, X86InstrControl.td,
  X86InstrExtension.td, X86InstrFMA.td, X86InstrFMA3Info.cpp, X86InstrFMA3Info.h,
  X86InstrFPStack.td, X86InstrFoldTables.cpp, X86InstrFoldTables.h,
  X86InstrFormats.td, X86InstrFragments.td, X86InstrFragmentsSIMD.td,
  X86InstrInfo.cpp, X86InstrInfo.h, X86InstrInfo.td, X86InstrKL.td,
  X86InstrMMX.td, X86InstrMisc.td, X86InstrOperands.td, X86InstrPredicates.td,
  X86InstrRAOINT.td, X86InstrSGX.td, X86InstrSNP.td, X86InstrSSE.td,
  X86InstrSVM.td, X86InstrShiftRotate.td, X86InstrSystem.td, X86InstrTBM.td,
  X86InstrTDX.td, X86InstrTSX.td, X86InstrUtils.td, X86InstrVMX.td,
  X86InstrVecCompiler.td, X86InstrXOP.td, X86InterleavedAccess.cpp,
  X86IntrinsicsInfo.h, X86LoadValueInjectionLoadHardening.cpp,
  X86LoadValueInjectionRetHardening.cpp, X86LowerAMXIntrinsics.cpp,
  X86LowerAMXType.cpp, X86LowerTileCopy.cpp, X86MCInstLower.cpp,
  X86MachineFunctionInfo.cpp, X86MachineFunctionInfo.h, X86MacroFusion.cpp,
  X86MacroFusion.h, X86OptimizeLEAs.cpp, X86PadShortFunction.cpp,
  X86PartialReduction.cpp, X86PassRegistry.def, X86PfmCounters.td,
  X86PreTileConfig.cpp, X86RegisterBanks.td, X86RegisterInfo.cpp,
  X86RegisterInfo.h, X86RegisterInfo.td, X86ReplaceableInstrs.def,
  X86ReturnThunks.cpp, X86SchedAlderlakeP.td, X86SchedBroadwell.td,
  X86SchedHaswell.td, X86SchedIceLake.td, X86SchedPredicates.td,
  X86SchedSandyBridge.td, X86SchedSapphireRapids.td, X86SchedSkylakeClient.td,
  X86SchedSkylakeServer.td, X86Schedule.td, X86ScheduleAtom.td,
  X86ScheduleBdVer2.td, X86ScheduleBtVer2.td, X86ScheduleSLM.td,
  X86ScheduleZnver1.td, X86ScheduleZnver2.td, X86ScheduleZnver3.td,
  X86ScheduleZnver4.td, X86SelectionDAGInfo.cpp, X86SelectionDAGInfo.h,
  X86ShuffleDecodeConstantPool.cpp, X86ShuffleDecodeConstantPool.h,
  X86SpeculativeExecutionSideEffectSuppression.cpp,
  X86SpeculativeLoadHardening.cpp, X86Subtarget.cpp, X86Subtarget.h,
  X86TargetMachine.cpp, X86TargetMachine.h, X86TargetObjectFile.cpp,
  X86TargetObjectFile.h, X86TargetTransformInfo.cpp, X86TargetTransformInfo.h,
  X86TileConfig.cpp, X86VZeroUpper.cpp, X86WinEHState.cpp,
  X86WinFixupBufferSecurityCheck.cpp

README-FPStack.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend: FP stack related stuff
3//===---------------------------------------------------------------------===//
4
5//===---------------------------------------------------------------------===//
6
7Some targets (e.g. athlons) prefer freep to fstp ST(0):
8http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
9
10//===---------------------------------------------------------------------===//
11
12This should use fiadd on chips where it is profitable:
13double foo(double P, int *I) { return P+*I; }
14
We have fiadd patterns now, but the following patterns have the same cost and
complexity. We need a way to specify that the latter is more profitable.
17
18def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
19                    [(set RFP:$dst, (fadd RFP:$src1,
20                                     (extloadf64f32 addr:$src2)))]>;
21                // ST(0) = ST(0) + [mem32]
22
23def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
24                    [(set RFP:$dst, (fadd RFP:$src1,
25                                     (X86fild addr:$src2, i32)))]>;
26                // ST(0) = ST(0) + [mem32int]
27
28//===---------------------------------------------------------------------===//
29
The FP stackifier should handle simple permutations to reduce the number of
shuffle instructions, e.g. turning:
32
33fld P	->		fld Q
34fld Q			fld P
35fxch
36
37or:
38
39fxch	->		fucomi
40fucomi			jl X
41jg X
42
43Ideas:
44http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
45
46
47//===---------------------------------------------------------------------===//
48
49Add a target specific hook to DAG combiner to handle SINT_TO_FP and
50FP_TO_SINT when the source operand is already in memory.
51
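For illustration (hypothetical functions, not taken from an existing testcase),
the kind of code such a hook would help with is:

double int_to_fp(int *p)    { return (double)*p; }  /* SINT_TO_FP of a value already in memory */
int    fp_to_int(double *p) { return (int)*p; }     /* FP_TO_SINT of a value already in memory */

The x87 forms (fild/fld from memory) can consume the memory operand directly
instead of first moving the value into a register.
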
52//===---------------------------------------------------------------------===//
53
54Open code rint,floor,ceil,trunc:
55http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
56http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
57
Open-code the sincos[f] libcall.
59
60//===---------------------------------------------------------------------===//
61
62None of the FPStack instructions are handled in
63X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
64folding spill code into the instructions.
65
66//===---------------------------------------------------------------------===//
67
68Currently the x86 codegen isn't very good at mixing SSE and FPStack
69code:
70
71unsigned int foo(double x) { return x; }
72
73foo:
74	subl $20, %esp
75	movsd 24(%esp), %xmm0
76	movsd %xmm0, 8(%esp)
77	fldl 8(%esp)
78	fisttpll (%esp)
79	movl (%esp), %eax
80	addl $20, %esp
81	ret
82
83This just requires being smarter when custom expanding fptoui.
84
85//===---------------------------------------------------------------------===//
86

README-SSE.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend: SSE-specific stuff.
3//===---------------------------------------------------------------------===//
4
5//===---------------------------------------------------------------------===//
6
7SSE Variable shift can be custom lowered to something like this, which uses a
8small table + unaligned load + shuffle instead of going through memory.
9
10__m128i_shift_right:
11	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13
14...
15__m128i shift_right(__m128i value, unsigned long offset) {
16  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
18}
19
20//===---------------------------------------------------------------------===//
21
SSE has instructions for doing operations on complex numbers; we should
pattern-match them. For example, this should turn into a horizontal add:
24
25typedef float __attribute__((vector_size(16))) v4f32;
26float f32(v4f32 A) {
27  return A[0]+A[1]+A[2]+A[3];
28}
29
30Instead we get this:
31
32_f32:                                   ## @f32
33	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
34	addss	%xmm0, %xmm1
35	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
36	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
37	movaps	%xmm0, %xmm3
38	addss	%xmm1, %xmm3
39	movdqa	%xmm2, %xmm0
40	addss	%xmm3, %xmm0
41	ret
42
Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:
45
46_Complex float f32(_Complex float A, _Complex float B) {
47  return A+B;
48}
49
50into:
51
52_f32:                                   ## @f32
53	movdqa	%xmm0, %xmm2
54	addss	%xmm1, %xmm2
55	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
56	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
57	addss	%xmm1, %xmm3
58	movaps	%xmm2, %xmm0
59	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60	ret
61
62seems silly when it could just be one addps.
63
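For the reduction above, a sketch of what the horizontal-add form could look
like at the source level, written with the SSE3 intrinsic (assumes SSE3 and a
hypothetical __m128 variant of f32):

#include <pmmintrin.h>

float f32_hadd(__m128 A) {
  __m128 t = _mm_hadd_ps(A, A);   /* haddps: (A0+A1, A2+A3, A0+A1, A2+A3) */
  t = _mm_hadd_ps(t, t);          /* every lane now holds A0+A1+A2+A3 */
  return _mm_cvtss_f32(t);
}
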
64
65//===---------------------------------------------------------------------===//
66
67Expand libm rounding functions inline:  Significant speedups possible.
68http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69
70//===---------------------------------------------------------------------===//
71
72When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
73other fast SSE modes.
74
75//===---------------------------------------------------------------------===//
76
77Think about doing i64 math in SSE regs on x86-32.
78
79//===---------------------------------------------------------------------===//
80
81This testcase should have no SSE instructions in it, and only one load from
82a constant pool:
83
84double %test3(bool %B) {
85        %C = select bool %B, double 123.412, double 523.01123123
86        ret double %C
87}
88
89Currently, the select is being lowered, which prevents the dag combiner from
90turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
91
92The pattern isel got this one right.
93
94//===---------------------------------------------------------------------===//
95
96Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
97feasible.
98
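For a fixed, small size the idea looks like this (a hand-written sketch with a
hypothetical helper, not the proposed lowering itself):

#include <emmintrin.h>

void memset32(char *p, char c) {
  __m128i v = _mm_set1_epi8(c);               /* broadcast the fill byte */
  _mm_storeu_si128((__m128i *)p, v);          /* two 128-bit stores cover 32 bytes */
  _mm_storeu_si128((__m128i *)(p + 16), v);
}
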
99//===---------------------------------------------------------------------===//
100
101Codegen:
102  if (copysign(1.0, x) == copysign(1.0, y))
103into:
104  if (x^y & mask)
105when using SSE.
106
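In scalar terms the transform is just a sign-bit test on the integer xor of the
two values; a minimal sketch for doubles, assuming the usual IEEE-754 layout:

#include <string.h>

int same_sign(double x, double y) {
  unsigned long long xb, yb;
  memcpy(&xb, &x, sizeof xb);                       /* reinterpret without UB */
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;  /* signs equal <=> xor has sign bit clear */
}
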
107//===---------------------------------------------------------------------===//
108
109Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
110of a v4sf value.
111
112//===---------------------------------------------------------------------===//
113
114Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
115Perhaps use pxor / xorp* to clear a XMM register first?
116
117//===---------------------------------------------------------------------===//
118
119External test Nurbs exposed some problems. Look for
120__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
121emits:
122
123        movaps    (%edx), %xmm2                                 #59.21
124        movaps    (%edx), %xmm5                                 #60.21
125        movaps    (%edx), %xmm4                                 #61.21
126        movaps    (%edx), %xmm3                                 #62.21
127        movl      40(%ecx), %ebp                                #69.49
128        shufps    $0, %xmm2, %xmm5                              #60.21
129        movl      100(%esp), %ebx                               #69.20
130        movl      (%ebx), %edi                                  #69.20
131        imull     %ebp, %edi                                    #69.49
132        addl      (%eax), %edi                                  #70.33
133        shufps    $85, %xmm2, %xmm4                             #61.21
134        shufps    $170, %xmm2, %xmm3                            #62.21
135        shufps    $255, %xmm2, %xmm2                            #63.21
136        lea       (%ebp,%ebp,2), %ebx                           #69.49
137        negl      %ebx                                          #69.49
138        lea       -3(%edi,%ebx), %ebx                           #70.33
139        shll      $4, %ebx                                      #68.37
140        addl      32(%ecx), %ebx                                #68.37
141        testb     $15, %bl                                      #91.13
142        jne       L_B1.24       # Prob 5%                       #91.13
143
144This is the llvm code after instruction scheduling:
145
146cond_next140 (0xa910740, LLVM BB @0xa90beb0):
147	%reg1078 = MOV32ri -3
148	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
149	%reg1037 = MOV32rm %reg1024, 1, %noreg, 40
150	%reg1080 = IMUL32rr %reg1079, %reg1037
151	%reg1081 = MOV32rm %reg1058, 1, %noreg, 0
152	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
153	%reg1036 = MOV32rm %reg1024, 1, %noreg, 32
154	%reg1082 = SHL32ri %reg1038, 4
155	%reg1039 = ADD32rr %reg1036, %reg1082
156	%reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
157	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
158	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
159	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
160	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
161	%reg1040 = MOV32rr %reg1039
162	%reg1084 = AND32ri8 %reg1039, 15
163	CMP32ri8 %reg1084, 0
164	JE mbb<cond_next204,0xa914d30>
165
166Still ok. After register allocation:
167
168cond_next140 (0xa910740, LLVM BB @0xa90beb0):
169	%eax = MOV32ri -3
170	%edx = MOV32rm %stack.3, 1, %noreg, 0
171	ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
172	%edx = MOV32rm %stack.7, 1, %noreg, 0
173	%edx = MOV32rm %edx, 1, %noreg, 40
174	IMUL32rr %eax<def&use>, %edx
175	%esi = MOV32rm %stack.5, 1, %noreg, 0
176	%esi = MOV32rm %esi, 1, %noreg, 0
177	MOV32mr %stack.4, 1, %noreg, 0, %esi
178	%eax = LEA32r %esi, 1, %eax, -3
179	%esi = MOV32rm %stack.7, 1, %noreg, 0
180	%esi = MOV32rm %esi, 1, %noreg, 32
181	%edi = MOV32rr %eax
182	SHL32ri %edi<def&use>, 4
183	ADD32rr %edi<def&use>, %esi
184	%xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
185	%xmm1 = MOVAPSrr %xmm0
186	SHUFPSrr %xmm1<def&use>, %xmm1, 170
187	%xmm2 = MOVAPSrr %xmm0
188	SHUFPSrr %xmm2<def&use>, %xmm2, 0
189	%xmm3 = MOVAPSrr %xmm0
190	SHUFPSrr %xmm3<def&use>, %xmm3, 255
191	SHUFPSrr %xmm0<def&use>, %xmm0, 85
192	%ebx = MOV32rr %edi
193	AND32ri8 %ebx<def&use>, 15
194	CMP32ri8 %ebx, 0
195	JE mbb<cond_next204,0xa914d30>
196
This looks really bad. The problem is that shufps is a destructive opcode: since
it appears as operand two in more than one shufps op, it results in a number of
copies. Note that icc also suffers from the same problem. Either the instruction
selector should select pshufd, or the register allocator should make the
two-address to three-address transformation.
202
203It also exposes some other problems. See MOV32ri -3 and the spills.
204
205//===---------------------------------------------------------------------===//
206
207Consider:
208
209__m128 test(float a) {
210  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
211}
212
213This compiles into:
214
215movss 4(%esp), %xmm1
216mulss %xmm1, %xmm1
217xorps %xmm0, %xmm0
218movss %xmm1, %xmm0
219ret
220
221Because mulss doesn't modify the top 3 elements, the top elements of
222xmm1 are already zero'd.  We could compile this to:
223
224movss 4(%esp), %xmm0
225mulss %xmm0, %xmm0
226ret
227
228//===---------------------------------------------------------------------===//
229
230Here's a sick and twisted idea.  Consider code like this:
231
232__m128 test(__m128 a) {
  float b = *(float*)&a;
234  ...
235  return _mm_set_ps(0.0, 0.0, 0.0, b);
236}
237
238This might compile to this code:
239
240movaps c(%esp), %xmm1
241xorps %xmm0, %xmm0
242movss %xmm1, %xmm0
243ret
244
245Now consider if the ... code caused xmm1 to get spilled.  This might produce
246this code:
247
248movaps c(%esp), %xmm1
249movaps %xmm1, c2(%esp)
250...
251
252xorps %xmm0, %xmm0
253movaps c2(%esp), %xmm1
254movss %xmm1, %xmm0
255ret
256
257However, since the reload is only used by these instructions, we could
258"fold" it into the uses, producing something like this:
259
260movaps c(%esp), %xmm1
261movaps %xmm1, c2(%esp)
262...
263
264movss c2(%esp), %xmm0
265ret
266
267... saving two instructions.
268
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros plus the one element instead of 4 elements.
271This can be used to simplify a variety of shuffle operations, where the
272elements are fixed zeros.
273
274//===---------------------------------------------------------------------===//
275
This generates ugly code, probably due to costs being off or something:
277
278define void @test(float* %P, <4 x float>* %P2 ) {
279        %xFloat0.688 = load float* %P
280        %tmp = load <4 x float>* %P2
281        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
282        store <4 x float> %inFloat3.713, <4 x float>* %P2
283        ret void
284}
285
286Generates:
287
288_test:
289	movl	8(%esp), %eax
290	movaps	(%eax), %xmm0
291	pxor	%xmm1, %xmm1
292	movaps	%xmm0, %xmm2
293	shufps	$50, %xmm1, %xmm2
294	shufps	$132, %xmm2, %xmm0
295	movaps	%xmm0, (%eax)
296	ret
297
298Would it be better to generate:
299
300_test:
301        movl 8(%esp), %ecx
302        movaps (%ecx), %xmm0
303	xor %eax, %eax
304        pinsrw $6, %eax, %xmm0
305        pinsrw $7, %eax, %xmm0
306        movaps %xmm0, (%ecx)
307        ret
308
309?
310
311//===---------------------------------------------------------------------===//
312
313Some useful information in the Apple Altivec / SSE Migration Guide:
314
315http://developer.apple.com/documentation/Performance/Conceptual/
316Accelerate_sse_migration/index.html
317
318e.g. SSE select using and, andnot, or. Various SSE compare translations.
319
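As a reminder, the and/andnot/or select idiom mentioned above looks like this
with intrinsics (a sketch; the mask lanes are assumed to be all-ones or
all-zeros, e.g. the result of a cmpps):

#include <xmmintrin.h>

__m128 select_ps(__m128 mask, __m128 a, __m128 b) {
  /* (mask & a) | (~mask & b): andps + andnps + orps */
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}
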
320//===---------------------------------------------------------------------===//
321
322Add hooks to commute some CMPP operations.
323
324//===---------------------------------------------------------------------===//
325
Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
328
329//===---------------------------------------------------------------------===//
330
331Floating point max / min are commutable when -enable-unsafe-fp-path is
332specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
333nodes which are selected to max / min instructions that are marked commutable.
334
335//===---------------------------------------------------------------------===//
336
337We should materialize vector constants like "all ones" and "signbit" with
338code like:
339
340     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
341
342and:
343     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     pslld   xmm1, 31     ; xmm1 = all 100000000000...
345
instead of using a load from the constant pool.  The latter is important for
ABS/NEG/copysign etc.
348
349//===---------------------------------------------------------------------===//
350
351These functions:
352
353#include <xmmintrin.h>
354__m128i a;
355void x(unsigned short n) {
356  a = _mm_slli_epi32 (a, n);
357}
358void y(unsigned n) {
359  a = _mm_slli_epi32 (a, n);
360}
361
362compile to ( -O3 -static -fomit-frame-pointer):
363_x:
364        movzwl  4(%esp), %eax
365        movd    %eax, %xmm0
366        movaps  _a, %xmm1
367        pslld   %xmm0, %xmm1
368        movaps  %xmm1, _a
369        ret
370_y:
371        movd    4(%esp), %xmm0
372        movaps  _a, %xmm1
373        pslld   %xmm0, %xmm1
374        movaps  %xmm1, _a
375        ret
376
"y" looks good, but "x" does silly movzwl stuff through a GPR.  It seems like
movd would be sufficient in both cases, as the value is already zero extended
in the 32-bit stack slot IIRC.  For signed short, it should also be safe, as a
genuinely negative value would be undefined for pslld anyway.
381
382
383//===---------------------------------------------------------------------===//
384
385#include <math.h>
386int t1(double d) { return signbit(d); }
387
388This currently compiles to:
389	subl	$12, %esp
390	movsd	16(%esp), %xmm0
391	movsd	%xmm0, (%esp)
392	movl	4(%esp), %eax
393	shrl	$31, %eax
394	addl	$12, %esp
395	ret
396
397We should use movmskp{s|d} instead.
398
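Roughly along these lines (a sketch with SSE2 intrinsics; the scalar value is
viewed as the low lane of an xmm register):

#include <emmintrin.h>

int t1_movmsk(double d) {
  /* movmskpd extracts the sign bits; bit 0 is the sign of the low lane */
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
}
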
399//===---------------------------------------------------------------------===//
400
401CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
402(aligned) vector load.  This functionality has a couple of problems.
403
4041. The code to infer alignment from loads of globals is in the X86 backend,
405   not the dag combiner.  This is because dagcombine2 needs to be able to see
406   through the X86ISD::Wrapper node, which DAGCombine can't really do.
4072. The code for turning 4 x load into a single vector load is target
408   independent and should be moved to the dag combiner.
4093. The code for turning 4 x load into a vector load can only handle a direct
410   load from a global or a direct load from the stack.  It should be generalized
411   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4124. The alignment inference code cannot handle loads from globals in non-static
413   mode because it doesn't look through the extra dyld stub load.  If you try
414   vec_align.ll without -relocation-model=static, you'll see what I mean.
415
416//===---------------------------------------------------------------------===//
417
418We should lower store(fneg(load p), q) into an integer load+xor+store, which
419eliminates a constant pool load.  For example, consider:
420
421define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
422entry:
423 %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
424 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
425 ret i64 %tmp20
426}
427declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
428
429This currently compiles to:
430
431LCPI1_0:					#  <4 x float>
432	.long	2147483648	# float -0
433	.long	2147483648	# float -0
434	.long	2147483648	# float -0
435	.long	2147483648	# float -0
436_ccosf:
437	subl	$12, %esp
438	movss	16(%esp), %xmm0
439	movss	%xmm0, 4(%esp)
440	movss	20(%esp), %xmm0
441	xorps	LCPI1_0, %xmm0
442	movss	%xmm0, (%esp)
443	call	L_ccoshf$stub
444	addl	$12, %esp
445	ret
446
447Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
448this code computes the pic base and does two loads to do the constant pool
449load, so the improvement is much bigger.
450
451The tricky part about this xform is that the argument load/store isn't exposed
452until post-legalize, and at that point, the fneg has been custom expanded into
453an X86 fxor.  This means that we need to handle this case in the x86 backend
454instead of in target independent code.
455
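The end result being aimed for, written out by hand (a sketch; the real
transform would of course happen during lowering, not in source):

#include <string.h>

void store_negated_float(const float *p, float *q) {
  unsigned int bits;
  memcpy(&bits, p, sizeof bits);   /* integer load */
  bits ^= 0x80000000u;             /* flip the sign bit; no constant-pool mask needed */
  memcpy(q, &bits, sizeof bits);   /* integer store */
}
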
456//===---------------------------------------------------------------------===//
457
458Non-SSE4 insert into 16 x i8 is atrociously bad.
459
460//===---------------------------------------------------------------------===//
461
462<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
463is memory.
464
465//===---------------------------------------------------------------------===//
466
467INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
468any number of 0.0 simultaneously.  Currently we only use it for simple
469insertions.
470
471See comments in LowerINSERT_VECTOR_ELT_SSE4.
472
473//===---------------------------------------------------------------------===//
474
475On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
476Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
477legal, it'll just take a few extra patterns written in the .td file.
478
479Note: this is not a code quality issue; the custom lowered code happens to be
480right, but we shouldn't have to custom lower anything.  This is probably related
481to <2 x i64> ops being so bad.
482
483//===---------------------------------------------------------------------===//
484
LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.
492
493Fixing this will require some huge RA changes.
494
495Testcase:
496#include <emmintrin.h>
497
498typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
499
500static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
501- 22725, - 12873};;
502
503vSInt16 madd(vSInt16 b)
504{
505    return _mm_madd_epi16(a, b);
506}
507
508Generated code (x86-32, linux):
509madd:
510        pushl   %ebp
511        movl    %esp, %ebp
512        andl    $-16, %esp
513        movaps  .LCPI1_0, %xmm1
514        pmaddwd %xmm1, %xmm0
515        movl    %ebp, %esp
516        popl    %ebp
517        ret
518
519//===---------------------------------------------------------------------===//
520
521Consider:
522#include <emmintrin.h>
523__m128 foo2 (float x) {
524 return _mm_set_ps (0, 0, x, 0);
525}
526
527In x86-32 mode, we generate this spiffy code:
528
529_foo2:
530	movss	4(%esp), %xmm0
531	pshufd	$81, %xmm0, %xmm0
532	ret
533
534in x86-64 mode, we generate this code, which could be better:
535
536_foo2:
537	xorps	%xmm1, %xmm1
538	movss	%xmm0, %xmm1
539	pshufd	$81, %xmm1, %xmm0
540	ret
541
542In sse4 mode, we could use insertps to make both better.
543
544Here's another testcase that could use insertps [mem]:
545
546#include <xmmintrin.h>
547extern float x2, x3;
548__m128 foo1 (float x1, float x4) {
549 return _mm_set_ps (x2, x1, x3, x4);
550}
551
552gcc mainline compiles it to:
553
554foo1:
555       insertps        $0x10, x2(%rip), %xmm0
556       insertps        $0x10, x3(%rip), %xmm1
557       movaps  %xmm1, %xmm2
558       movlhps %xmm0, %xmm2
559       movaps  %xmm2, %xmm0
560       ret
561
562//===---------------------------------------------------------------------===//
563
564We compile vector multiply-by-constant into poor code:
565
566define <4 x i32> @f(<4 x i32> %i) nounwind  {
567	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
568	ret <4 x i32> %A
569}
570
571On targets without SSE4.1, this compiles into:
572
573LCPI1_0:					##  <4 x i32>
574	.long	10
575	.long	10
576	.long	10
577	.long	10
578	.text
579	.align	4,0x90
580	.globl	_f
581_f:
582	pshufd	$3, %xmm0, %xmm1
583	movd	%xmm1, %eax
584	imull	LCPI1_0+12, %eax
585	movd	%eax, %xmm1
586	pshufd	$1, %xmm0, %xmm2
587	movd	%xmm2, %eax
588	imull	LCPI1_0+4, %eax
589	movd	%eax, %xmm2
590	punpckldq	%xmm1, %xmm2
591	movd	%xmm0, %eax
592	imull	LCPI1_0, %eax
593	movd	%eax, %xmm1
594	movhlps	%xmm0, %xmm0
595	movd	%xmm0, %eax
596	imull	LCPI1_0+8, %eax
597	movd	%eax, %xmm0
598	punpckldq	%xmm0, %xmm1
599	movaps	%xmm1, %xmm0
600	punpckldq	%xmm2, %xmm0
601	ret
602
603It would be better to synthesize integer vector multiplication by constants
604using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
605simple cases such as multiplication by powers of two would be better as
606vector shifts than as multiplications.
607
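For the multiply-by-10 case above, the shift/add expansion is simply
10*x = 8*x + 2*x, e.g. (a sketch in intrinsics):

#include <emmintrin.h>

__m128i mul10(__m128i x) {
  /* two pslld plus one paddd instead of four scalar imulls and the shuffles around them */
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}
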
608//===---------------------------------------------------------------------===//
609
610We compile this:
611
612__m128i
613foo2 (char x)
614{
615  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
616}
617
618into:
619	movl	$1, %eax
620	xorps	%xmm0, %xmm0
621	pinsrw	$2, %eax, %xmm0
622	movzbl	4(%esp), %eax
623	pinsrw	$3, %eax, %xmm0
624	movl	$256, %eax
625	pinsrw	$7, %eax, %xmm0
626	ret
627
628
629gcc-4.2:
630	subl	$12, %esp
631	movzbl	16(%esp), %eax
632	movdqa	LC0, %xmm0
633	pinsrw	$3, %eax, %xmm0
634	addl	$12, %esp
635	ret
636	.const
637	.align 4
638LC0:
639	.word	0
640	.word	0
641	.word	1
642	.word	0
643	.word	0
644	.word	0
645	.word	0
646	.word	256
647
648With SSE4, it should be
649      movdqa  .LC0(%rip), %xmm0
650      pinsrb  $6, %edi, %xmm0
651
652//===---------------------------------------------------------------------===//
653
We should transform a shuffle of two vectors of constants into a single vector
of constants. Also, insertelement of a constant into a vector of constants
should result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.
657
658We compiled it to something horrible:
659
660	.align	4
661LCPI1_1:					##  float
662	.long	1065353216	## float 1
663	.const
664
665	.align	4
666LCPI1_0:					##  <4 x float>
667	.space	4
668	.long	1065353216	## float 1
669	.space	4
670	.long	1065353216	## float 1
671	.text
672	.align	4,0x90
673	.globl	_t
674_t:
675	xorps	%xmm0, %xmm0
676	movhps	LCPI1_0, %xmm0
677	movss	LCPI1_1, %xmm1
678	movaps	%xmm0, %xmm2
679	shufps	$2, %xmm1, %xmm2
680	shufps	$132, %xmm2, %xmm0
681	movaps	%xmm0, 0
682
683//===---------------------------------------------------------------------===//
684
685Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
686when code size is critical. movlps is slower than movsd on core2 but it's one
687byte shorter.
688
689//===---------------------------------------------------------------------===//
690
691We should use a dynamic programming based approach to tell when using FPStack
692operations is cheaper than SSE.  SciMark montecarlo contains code like this
693for example:
694
695double MonteCarlo_num_flops(int Num_samples) {
696    return ((double) Num_samples)* 4.0;
697}
698
699In fpstack mode, this compiles into:
700
701LCPI1_0:
702	.long	1082130432	## float 4.000000e+00
703_MonteCarlo_num_flops:
704	subl	$4, %esp
705	movl	8(%esp), %eax
706	movl	%eax, (%esp)
707	fildl	(%esp)
708	fmuls	LCPI1_0
709	addl	$4, %esp
710	ret
711
712in SSE mode, it compiles into significantly slower code:
713
714_MonteCarlo_num_flops:
715	subl	$12, %esp
716	cvtsi2sd	16(%esp), %xmm0
717	mulsd	LCPI1_0, %xmm0
718	movsd	%xmm0, (%esp)
719	fldl	(%esp)
720	addl	$12, %esp
721	ret
722
There are also other cases in SciMark where using the FP stack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the FP stack, etc.
726
727//===---------------------------------------------------------------------===//
728
These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?
731
732define float @foo(float %x) nounwind {
733  %t = bitcast float %x to i32
734  %s = and i32 %t, 2147483647
735  %d = bitcast i32 %s to float
736  ret float %d
737}
738
739declare float @fabsf(float %n)
740define float @bar(float %x) nounwind {
741  %d = call float @fabsf(float %x)
742  ret float %d
743}
744
745//===---------------------------------------------------------------------===//
746
747This IR (from PR6194):
748
749target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
750target triple = "x86_64-apple-darwin10.0.0"
751
752%0 = type { double, double }
753%struct.float3 = type { float, float, float }
754
755define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
756entry:
757  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
758  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
759  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
760  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
761  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
762  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
763  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
764  store float %tmp12, float* %tmp5
765  ret void
766}
767
768Compiles to:
769
770_test:                                  ## @test
771	movd	%xmm0, %rax
772	shrq	$32, %rax
773	movl	%eax, 4(%rdi)
774	ret
775
776This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
777doing a shuffle from v[1] to v[0] then a float store.
778
779//===---------------------------------------------------------------------===//
780
781[UNSAFE FP]
782
783void foo(double, double, double);
784void norm(double x, double y, double z) {
785  double scale = __builtin_sqrt(x*x + y*y + z*z);
786  foo(x/scale, y/scale, z/scale);
787}
788
We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.
793
794If we're dealing with floats instead of doubles we could even replace the sqrtss
795and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
796cost of reduced accuracy.
797
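A sketch of the reassociated form (only valid under the unsafe-FP assumptions
this entry is about):

void foo(double, double, double);

void norm_fast(double x, double y, double z) {
  double inv_scale = 1.0 / __builtin_sqrt(x*x + y*y + z*z);  /* one sqrtsd, one divsd */
  foo(x*inv_scale, y*inv_scale, z*inv_scale);                /* three mulsd, no more divs */
}
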
798//===---------------------------------------------------------------------===//
799

README-X86-64.txt

1//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
2
3AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
4multiplication by a constant. How much of it applies to Intel's X86-64
5implementation? There are definite trade-offs to consider: latency vs. register
6pressure vs. code size.
7
8//===---------------------------------------------------------------------===//
9
10Are we better off using branches instead of cmove to implement FP to
11unsigned i64?
12
13_conv:
14	ucomiss	LC0(%rip), %xmm0
15	cvttss2siq	%xmm0, %rdx
16	jb	L3
17	subss	LC0(%rip), %xmm0
18	movabsq	$-9223372036854775808, %rax
19	cvttss2siq	%xmm0, %rdx
20	xorq	%rax, %rdx
21L3:
22	movq	%rdx, %rax
23	ret
24
25instead of
26
27_conv:
28	movss LCPI1_0(%rip), %xmm1
29	cvttss2siq %xmm0, %rcx
30	movaps %xmm0, %xmm2
31	subss %xmm1, %xmm2
32	cvttss2siq %xmm2, %rax
33	movabsq $-9223372036854775808, %rdx
34	xorq %rdx, %rax
35	ucomiss %xmm1, %xmm0
36	cmovb %rcx, %rax
37	ret
38
Seems like the jb branch has a high likelihood of being taken; it would have
saved a few instructions.
41
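For reference, both sequences implement float -> unsigned 64-bit conversion;
in C the bias trick looks roughly like this (a sketch; 2^63 is the LC0
constant in the assembly above):

unsigned long long conv(float x) {
  const float two63 = 9223372036854775808.0f;   /* 2^63 */
  if (x < two63)                                /* the jb path: fits in a signed convert */
    return (unsigned long long)(long long)x;
  /* subtract the bias, convert, then put the top bit back with an xor */
  return (unsigned long long)(long long)(x - two63) ^ 0x8000000000000000ULL;
}
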
42//===---------------------------------------------------------------------===//
43
It's not possible to reference the AH, BH, CH, and DH registers in an
instruction requiring a REX prefix. However, divb and mulb both produce results
in AH. The problem shows up if isel emits a CopyFromReg which gets turned into
a movb whose destination can be allocated to r8b - r15b.

To get around this, isel emits a CopyFromReg from AX and then right-shifts it
down by 8 and truncates it. It's not pretty, but it works. We need some
register allocation magic to make the hack go away (e.g. putting additional
constraints on the result of the movb).
53
54//===---------------------------------------------------------------------===//
55
56The x86-64 ABI for hidden-argument struct returns requires that the
57incoming value of %rdi be copied into %rax by the callee upon return.
58
59The idea is that it saves callers from having to remember this value,
60which would often require a callee-saved register. Callees usually
61need to keep this value live for most of their body anyway, so it
62doesn't add a significant burden on them.
63
We currently implement this in codegen; however, this is suboptimal because it
makes it quite awkward to implement the corresponding optimization for callers.
67
68A better implementation would be to relax the LLVM IR rules for sret
69arguments to allow a function with an sret argument to have a non-void
return type, and to have the front-end set up the sret argument value
71as the return value of the function. The front-end could more easily
72emit uses of the returned struct value to be in terms of the function's
73lowered return value, and it would free non-C frontends from a
74complication only required by a C-based ABI.
75
76//===---------------------------------------------------------------------===//
77
78We get a redundant zero extension for code like this:
79
80int mask[1000];
81int foo(unsigned x) {
82 if (x < 10)
83   x = x * 45;
84 else
85   x = x * 78;
86 return mask[x];
87}
88
89_foo:
90LBB1_0:	## entry
91	cmpl	$9, %edi
92	jbe	LBB1_3	## bb
93LBB1_1:	## bb1
94	imull	$78, %edi, %eax
95LBB1_2:	## bb2
96	movl	%eax, %eax                    <----
97	movq	_mask@GOTPCREL(%rip), %rcx
98	movl	(%rcx,%rax,4), %eax
99	ret
100LBB1_3:	## bb
101	imull	$45, %edi, %eax
102	jmp	LBB1_2	## bb2
103
104Before regalloc, we have:
105
106        %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags
107        JMP mbb<bb2,0x203afb0>
108    Successors according to CFG: 0x203afb0 (#3)
109
110bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
111    Predecessors according to CFG: 0x203aec0 (#0)
112        %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags
113    Successors according to CFG: 0x203afb0 (#3)
114
115bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
116    Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
117        %reg1027 = PHI %reg1025, mbb<bb,0x203af10>,
118                            %reg1026, mbb<bb1,0x203af60>
119        %reg1029 = MOVZX64rr32 %reg1027
120
so we'd have to know that IMUL32rri8 leaves the upper 32 bits zeroed and be
able to recognize the zero extend.  This could also presumably be implemented
123if we have whole-function selectiondags.
124
125//===---------------------------------------------------------------------===//
126
127Take the following code
128(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
129extern unsigned long table[];
130unsigned long foo(unsigned char *p) {
131  unsigned long tag = *p;
132  return table[tag >> 4] + table[tag & 0xf];
133}
134
135Current code generated:
136	movzbl	(%rdi), %eax
137	movq	%rax, %rcx
138	andq	$240, %rcx
139	shrq	%rcx
140	andq	$15, %rax
141	movq	table(,%rax,8), %rax
142	addq	table(%rcx), %rax
143	ret
144
145Issues:
1461. First movq should be movl; saves a byte.
1472. Both andq's should be andl; saves another two bytes.  I think this was
148   implemented at one point, but subsequently regressed.
1493. shrq should be shrl; saves another byte.
1504. The first andq can be completely eliminated by using a slightly more
151   expensive addressing mode.
152
153//===---------------------------------------------------------------------===//
154
155Consider the following (contrived testcase, but contains common factors):
156
157#include <stdarg.h>
158int test(int x, ...) {
159  int sum, i;
160  va_list l;
161  va_start(l, x);
162  for (i = 0; i < x; i++)
163    sum += va_arg(l, int);
164  va_end(l);
165  return sum;
166}
167
168Testcase given in C because fixing it will likely involve changing the IR
169generated for it.  The primary issue with the result is that it doesn't do any
170of the optimizations which are possible if we know the address of a va_list
171in the current function is never taken:
1721. We shouldn't spill the XMM registers because we only call va_arg with "int".
1732. It would be nice if we could sroa the va_list.
1743. Probably overkill, but it'd be cool if we could peel off the first five
175iterations of the loop.
176
177Other optimizations involving functions which use va_arg on floats which don't
178have the address of a va_list taken:
1791. Conversely to the above, we shouldn't spill general registers if we only
180   call va_arg on "double".
1812. If we know nothing more than 64 bits wide is read from the XMM registers,
182   we can change the spilling code to reduce the amount of stack used by half.
183
184//===---------------------------------------------------------------------===//
185

README.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend.
3//===---------------------------------------------------------------------===//
4
5Improvements to the multiply -> shift/add algorithm:
6http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
7
8//===---------------------------------------------------------------------===//
9
10Improve code like this (occurs fairly frequently, e.g. in LLVM):
11long long foo(int x) { return 1LL << x; }
12
13http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
14http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
15http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
16
17Another useful one would be  ~0ULL >> X and ~0ULL << X.
18
19One better solution for 1LL << x is:
20        xorl    %eax, %eax
21        xorl    %edx, %edx
22        testb   $32, %cl
23        sete    %al
24        setne   %dl
25        sall    %cl, %eax
26        sall    %cl, %edx
27
28But that requires good 8-bit subreg support.
29
30Also, this might be better.  It's an extra shift, but it's one instruction
31shorter, and doesn't stress 8-bit subreg support.
32(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
33but without the unnecessary and.)
34        movl %ecx, %eax
35        shrl $5, %eax
36        movl %eax, %edx
37        xorl $1, %edx
38        sall %cl, %eax
        sall %cl, %edx
40
4164-bit shifts (in general) expand to really bad code.  Instead of using
42cmovs, we should expand to a conditional branch like GCC produces.
43
44//===---------------------------------------------------------------------===//
45
46Some isel ideas:
47
481. Dynamic programming based approach when compile time is not an
49   issue.
502. Code duplication (addressing mode) during isel.
513. Other ideas from "Register-Sensitive Selection, Duplication, and
52   Sequencing of Instructions".
534. Scheduling for reduced register pressure.  E.g. "Minimum Register
54   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
55   and other related papers.
56   http://citeseer.ist.psu.edu/govindarajan01minimum.html
57
58//===---------------------------------------------------------------------===//
59
60Should we promote i16 to i32 to avoid partial register update stalls?
61
62//===---------------------------------------------------------------------===//
63
Leave any_extend as a pseudo instruction and hint to the register allocator.
Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it, though.
68
69//===---------------------------------------------------------------------===//
70
It appears icc uses push for parameter passing. Need to investigate.
72
73//===---------------------------------------------------------------------===//
74
75The instruction selector sometimes misses folding a load into a compare.  The
76pattern is written as (cmp reg, (load p)).  Because the compare isn't
77commutative, it is not matched with the load on both sides.  The dag combiner
78should be made smart enough to canonicalize the load into the RHS of a compare
79when it can invert the result of the compare for free.
80
81//===---------------------------------------------------------------------===//
82
83In many cases, LLVM generates code like this:
84
85_test:
86        movl 8(%esp), %eax
87        cmpl %eax, 4(%esp)
88        setl %al
89        movzbl %al, %eax
90        ret
91
92on some processors (which ones?), it is more efficient to do this:
93
94_test:
95        movl 8(%esp), %ebx
96        xor  %eax, %eax
97        cmpl %ebx, 4(%esp)
98        setl %al
99        ret
100
101Doing this correctly is tricky though, as the xor clobbers the flags.
102
103//===---------------------------------------------------------------------===//
104
105We should generate bts/btr/etc instructions on targets where they are cheap or
106when codesize is important.  e.g., for:
107
108void setbit(int *target, int bit) {
109    *target |= (1 << bit);
110}
111void clearbit(int *target, int bit) {
112    *target &= ~(1 << bit);
113}
114
115//===---------------------------------------------------------------------===//
116
117Instead of the following for memset char*, 1, 10:
118
119	movl $16843009, 4(%edx)
120	movl $16843009, (%edx)
121	movw $257, 8(%edx)
122
It might be better to generate:

	movl $16843009, %eax
	movl %eax, 4(%edx)
	movl %eax, (%edx)
	movw %ax, 8(%edx)
129
130when we can spare a register. It reduces code size.
131
132//===---------------------------------------------------------------------===//
133
134Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
135get this:
136
137define i32 @test1(i32 %X) {
138    %Y = sdiv i32 %X, 8
139    ret i32 %Y
140}
141
142_test1:
143        movl 4(%esp), %eax
144        movl %eax, %ecx
145        sarl $31, %ecx
146        shrl $29, %ecx
147        addl %ecx, %eax
148        sarl $3, %eax
149        ret
150
151GCC knows several different ways to codegen it, one of which is this:
152
153_test1:
154        movl    4(%esp), %eax
155        cmpl    $-1, %eax
156        leal    7(%eax), %ecx
157        cmovle  %ecx, %eax
158        sarl    $3, %eax
159        ret
160
161which is probably slower, but it's interesting at least :)
162
163//===---------------------------------------------------------------------===//
164
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand-tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).
169
170//===---------------------------------------------------------------------===//
171
172Optimize this into something reasonable:
173 x * copysign(1.0, y) * copysign(1.0, z)
174
175//===---------------------------------------------------------------------===//
176
177Optimize copysign(x, *y) to use an integer load from y.
178
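That is, only the sign bit of *y is ever needed, so an integer load of y avoids
touching the FP side at all. A sketch for doubles:

#include <string.h>

double copysign_mem(double x, const double *y) {
  unsigned long long xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, y, sizeof yb);                    /* integer load of *y */
  xb = (xb & 0x7fffffffffffffffULL) | (yb & 0x8000000000000000ULL);
  memcpy(&x, &xb, sizeof xb);
  return x;
}
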
179//===---------------------------------------------------------------------===//
180
181The following tests perform worse with LSR:
182
183lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
184
185//===---------------------------------------------------------------------===//
186
187Adding to the list of cmp / test poor codegen issues:
188
189int test(__m128 *A, __m128 *B) {
190  if (_mm_comige_ss(*A, *B))
191    return 3;
192  else
193    return 4;
194}
195
196_test:
197	movl 8(%esp), %eax
198	movaps (%eax), %xmm0
199	movl 4(%esp), %eax
200	movaps (%eax), %xmm1
201	comiss %xmm0, %xmm1
202	setae %al
203	movzbl %al, %ecx
204	movl $3, %eax
205	movl $4, %edx
206	cmpl $0, %ecx
207	cmove %edx, %eax
208	ret
209
Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues: 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value, so an any-extend (which becomes a zero extend) is added.
214
215We probably need some kind of target DAG combine hook to fix this.
216
217//===---------------------------------------------------------------------===//
218
219We generate significantly worse code for this than GCC:
220http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
221http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
222
223There is also one case we do worse on PPC.
224
225//===---------------------------------------------------------------------===//
226
227For this:
228
229int test(int a)
230{
231  return a * 3;
232}
233
We currently emit
	imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:
	movl	4(%esp), %eax
	leal	(%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load, so the latter will not be emitted.
244
245Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
246should always try to match LEA first since the LEA matching code does some
247estimate to determine whether the match is profitable.
248
249However, if we care more about code size, then imull is better. It's two bytes
250shorter than movl + leal.
251
252On a Pentium M, both variants have the same characteristics with regard
253to throughput; however, the multiplication has a latency of four cycles, as
254opposed to two cycles for the movl+lea variant.
255
256//===---------------------------------------------------------------------===//
257
It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.
263
264//===---------------------------------------------------------------------===//
265
266define i32 @foo(i32* %a, i32 %t) {
267entry:
268	br label %cond_true
269
270cond_true:		; preds = %cond_true, %entry
271	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
272	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
273	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
274	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
275	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
276	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
277	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
278	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
279	br i1 %tmp, label %bb12, label %cond_true
280
281bb12:		; preds = %cond_true
282	ret i32 %tmp7
283}
284is pessimized by -loop-reduce and -indvars
285
286//===---------------------------------------------------------------------===//
287
288u32 to float conversion improvement:
289
290float uint32_2_float( unsigned u ) {
291  float fl = (int) (u & 0xffff);
292  float fh = (int) (u >> 16);
293  fh *= 0x1.0p16f;
294  return fh + fl;
295}
296
29700000000        subl    $0x04,%esp
29800000003        movl    0x08(%esp,1),%eax
29900000007        movl    %eax,%ecx
30000000009        shrl    $0x10,%ecx
3010000000c        cvtsi2ss        %ecx,%xmm0
30200000010        andl    $0x0000ffff,%eax
30300000015        cvtsi2ss        %eax,%xmm1
30400000019        mulss   0x00000078,%xmm0
30500000021        addss   %xmm1,%xmm0
30600000025        movss   %xmm0,(%esp,1)
3070000002a        flds    (%esp,1)
3080000002d        addl    $0x04,%esp
30900000030        ret
310
311//===---------------------------------------------------------------------===//
312
When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.
315
316//===---------------------------------------------------------------------===//
317
318GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
319simplifications for integer "x cmp y ? a : b".
320
321//===---------------------------------------------------------------------===//
322
323Consider the expansion of:
324
325define i32 @test3(i32 %X) {
326        %tmp1 = urem i32 %X, 255
327        ret i32 %tmp1
328}
329
330Currently it compiles to:
331
332...
333        movl $2155905153, %ecx
334        movl 8(%esp), %esi
335        movl %esi, %eax
336        mull %ecx
337...
338
339This could be "reassociated" into:
340
341        movl $2155905153, %eax
342        movl 8(%esp), %ecx
343        mull %ecx
344
345to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the number of uses of the mul?
348
349//===---------------------------------------------------------------------===//
350
351Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
353That is somewhat complicated, but doable. Example 256.bzip2:
354
355In the new trace, the hot loop has an instruction which crosses a cacheline
356boundary.  In addition to potential cache misses, this can't help decoding as I
357imagine there has to be some kind of complicated decoder reset and realignment
358to grab the bytes from the next cacheline.
359
360532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
361942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
362937  937 0x3d0a incl     %esi
3633    3   0x3d0b cmpb     %bl, %dl
36427   27  0x3d0d jnz      0x000062db <main+11707>
365
366//===---------------------------------------------------------------------===//
367
368In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
369
370//===---------------------------------------------------------------------===//
371
372This could be a single 16-bit load.
373
374int f(char *p) {
375    if ((p[0] == 1) & (p[1] == 2)) return 1;
376    return 0;
377}
378
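That is, on a little-endian target the two byte compares collapse into one
16-bit compare (a sketch; memcpy keeps the unaligned access well-defined):

#include <string.h>

int f_16bit(const char *p) {
  unsigned short v;
  memcpy(&v, p, sizeof v);            /* single 16-bit load */
  return v == ((2 << 8) | 1);         /* p[0] == 1 and p[1] == 2 */
}
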
379//===---------------------------------------------------------------------===//
380
381We should inline lrintf and probably other libc functions.
382
383//===---------------------------------------------------------------------===//
384
385This code:
386
387void test(int X) {
388  if (X) abort();
389}
390
391is currently compiled to:
392
393_test:
394        subl $12, %esp
395        cmpl $0, 16(%esp)
396        jne LBB1_1
397        addl $12, %esp
398        ret
399LBB1_1:
400        call L_abort$stub
401
402It would be better to produce:
403
404_test:
405        subl $12, %esp
406        cmpl $0, 16(%esp)
407        jne L_abort$stub
408        addl $12, %esp
409        ret
410
This can be applied to any call to a no-return function that takes no arguments.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:
414
415_test:
416        cmpl $0, 4(%esp)
417        jne LBB1_1
418        ret
419LBB1_1:
420        subl $12, %esp
421        call L_abort$stub
422
423Both are useful in different situations.  Finally, it could be shrink-wrapped
424and tail called, like this:
425
426_test:
427        cmpl $0, 4(%esp)
428        jne LBB1_1
429        ret
430LBB1_1:
431        pop %eax   # realign stack.
432        call L_abort$stub
433
434Though this probably isn't worth it.
435
436//===---------------------------------------------------------------------===//
437
438Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
439a neg instead of a sub instruction.  Consider:
440
441int test(char X) { return 7-X; }
442
443we currently produce:
444_test:
445        movl $7, %eax
446        movsbl 4(%esp), %ecx
447        subl %ecx, %eax
448        ret
449
450We would use one fewer register if codegen'd as:
451
        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret
456
457Note that this isn't beneficial if the load can be folded into the sub.  In
458this case, we want a sub:
459
460int test(int X) { return 7-X; }
461_test:
462        movl $7, %eax
463        subl 4(%esp), %eax
464        ret
465
466//===---------------------------------------------------------------------===//
467
468Leaf functions that require one 4-byte spill slot have a prolog like this:
469
470_foo:
471        pushl   %esi
472        subl    $4, %esp
473...
474and an epilog like this:
475        addl    $4, %esp
476        popl    %esi
477        ret
478
479It would be smaller, and potentially faster, to push eax on entry and to
480pop into a dummy register instead of using addl/subl of esp.  Just don't pop
481into any return registers :)
482
483//===---------------------------------------------------------------------===//
484
485The X86 backend should fold (branch (or (setcc, setcc))) into multiple
486branches.  We generate really poor code for:
487
488double testf(double a) {
489       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
490}
491
492For example, the entry BB is:
493
494_testf:
495        subl    $20, %esp
496        pxor    %xmm0, %xmm0
497        movsd   24(%esp), %xmm1
498        ucomisd %xmm0, %xmm1
499        setnp   %al
500        sete    %cl
501        testb   %cl, %al
502        jne     LBB1_5  # UnifiedReturnBlock
503LBB1_1: # cond_true
504
505
506it would be better to replace the last four instructions with:
507
508	jp LBB1_1
509	je LBB1_5
510LBB1_1:
511
512We also codegen the inner ?: into a diamond:
513
514       cvtss2sd        LCPI1_0(%rip), %xmm2
515        cvtss2sd        LCPI1_1(%rip), %xmm3
516        ucomisd %xmm1, %xmm0
517        ja      LBB1_3  # cond_true
518LBB1_2: # cond_true
519        movapd  %xmm3, %xmm2
520LBB1_3: # cond_true
521        movapd  %xmm2, %xmm0
522        ret
523
We should sink the load that defines %xmm3 into the LBB1_2 block, its only
use.  This should be pretty easy, and will nuke all the copies.
526
527//===---------------------------------------------------------------------===//
528
529This:
530        #include <algorithm>
531        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
532        { return std::make_pair(a + b, a + b < a); }
533        bool no_overflow(unsigned a, unsigned b)
534        { return !full_add(a, b).second; }
535
536Should compile to:
537	addl	%esi, %edi
538	setae	%al
539	movzbl	%al, %eax
540	ret
541
542on x86-64, instead of the rather stupid-looking:
543	addl	%esi, %edi
544	setb	%al
545	xorb	$1, %al
546	movzbl	%al, %eax
547	ret
548
549
550//===---------------------------------------------------------------------===//
551
552The following code:
553
554bb114.preheader:		; preds = %cond_next94
555	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
556	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
557	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
558	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
559	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
560	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
561	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
562	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
563	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
564	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
565	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
566	br label %bb114
567
568produces:
569
570LBB3_5:	# bb114.preheader
571	movswl	-68(%ebp), %eax
572	movl	$32, %ecx
573	movl	%ecx, -80(%ebp)
574	subl	%eax, -80(%ebp)
575	movswl	-52(%ebp), %eax
576	movl	%ecx, -84(%ebp)
577	subl	%eax, -84(%ebp)
578	movswl	-70(%ebp), %eax
579	movl	%ecx, -88(%ebp)
580	subl	%eax, -88(%ebp)
581	movswl	-50(%ebp), %eax
582	subl	%eax, %ecx
583	movl	%ecx, -76(%ebp)
584	movswl	-42(%ebp), %eax
585	movl	%eax, -92(%ebp)
586	movswl	-66(%ebp), %eax
587	movl	%eax, -96(%ebp)
588	movw	$0, -98(%ebp)
589
590This appears to be bad because the RA is not folding the store to the stack
591slot into the movl.  The above instructions could be:
592	movl    $32, -80(%ebp)
593...
594	movl    $32, -84(%ebp)
595...
596This seems like a cross between remat and spill folding.
597
598This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
599change, so we could simply subtract %eax from %ecx first and then use %ecx (or
600vice-versa).
601
602//===---------------------------------------------------------------------===//
603
604This code:
605
606	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
607	br i1 %tmp659, label %cond_true662, label %cond_next715
608
609produces this:
610
611	testw	%cx, %cx
612	movswl	%cx, %esi
613	jns	LBB4_109	# cond_next715
614
615Shark tells us that using %cx in the testw instruction is sub-optimal. It
616suggests using the 32-bit register (which is what ICC uses).
617
618//===---------------------------------------------------------------------===//
619
620We compile this:
621
622void compare (long long foo) {
623  if (foo < 4294967297LL)
624    abort();
625}
626
627to:
628
629compare:
630        subl    $4, %esp
631        cmpl    $0, 8(%esp)
632        setne   %al
633        movzbw  %al, %ax
634        cmpl    $1, 12(%esp)
635        setg    %cl
636        movzbw  %cl, %cx
637        cmove   %ax, %cx
638        testb   $1, %cl
639        jne     .LBB1_2 # UnifiedReturnBlock
640.LBB1_1:        # ifthen
641        call    abort
642.LBB1_2:        # UnifiedReturnBlock
643        addl    $4, %esp
644        ret
645
646(also really horrible code on ppc).  This is due to the expand code for 64-bit
647compares.  GCC produces multiple branches, which is much nicer:
648
649compare:
650        subl    $12, %esp
651        movl    20(%esp), %edx
652        movl    16(%esp), %eax
653        decl    %edx
654        jle     .L7
655.L5:
656        addl    $12, %esp
657        ret
658        .p2align 4,,7
659.L7:
660        jl      .L4
661        cmpl    $0, %eax
662        .p2align 4,,8
663        ja      .L5
664.L4:
665        .p2align 4,,9
666        call    abort
667
668//===---------------------------------------------------------------------===//
669
Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail-call-optimized calls) if they source from the caller's arguments
or from a virtual register (which may itself source from the caller's
arguments).
This is done to prevent overwriting of parameters (see the example
below) that might be used later.
677
678example:
679
680int callee(int32, int64);
681int caller(int32 arg1, int32 arg2) {
682  int64 local = arg2 * 2;
683  return callee(arg2, (int64)local);
684}
685
686[arg1]          [!arg2 no longer valid since we moved local onto it]
687[arg2]      ->  [(int64)
688[RETADDR]        local  ]
689
Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.
692
Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter that is still used by the callee, and only
   push those onto the top of the stack.
699
700   int callee (int32 arg1, int32 arg2);
701   int caller (int32 arg1, int32 arg2) {
702       return callee(arg1,arg2);
703   }
704
705   Here we don't need to write any variables to the top of the stack
706   since they don't overwrite each other.
707
708   int callee (int32 arg1, int32 arg2);
709   int caller (int32 arg1, int32 arg2) {
710       return callee(arg2,arg1);
711   }
712
713   Here we need to push the arguments because they overwrite each
714   other.
715
716//===---------------------------------------------------------------------===//
717
718main ()
719{
720  int i = 0;
721  unsigned long int z = 0;
722
723  do {
724    z -= 0x00004000;
725    i++;
726    if (i > 0x00040000)
727      abort ();
728  } while (z > 0);
729  exit (0);
730}
731
732gcc compiles this to:
733
734_main:
735	subl	$28, %esp
736	xorl	%eax, %eax
737	jmp	L2
738L3:
739	cmpl	$262144, %eax
740	je	L10
741L2:
742	addl	$1, %eax
743	cmpl	$262145, %eax
744	jne	L3
745	call	L_abort$stub
746L10:
747	movl	$0, (%esp)
748	call	L_exit$stub
749
750llvm:
751
752_main:
753	subl	$12, %esp
754	movl	$1, %eax
755	movl	$16384, %ecx
756LBB1_1:	# bb
757	cmpl	$262145, %eax
758	jge	LBB1_4	# cond_true
759LBB1_2:	# cond_next
760	incl	%eax
761	addl	$4294950912, %ecx
762	cmpl	$16384, %ecx
763	jne	LBB1_1	# bb
764LBB1_3:	# bb11
765	xorl	%eax, %eax
766	addl	$12, %esp
767	ret
768LBB1_4:	# cond_true
769	call	L_abort$stub
770
7711. LSR should rewrite the first cmp with induction variable %ecx.
7722. DAG combiner should fold
773        leal    1(%eax), %edx
774        cmpl    $262145, %edx
775   =>
776        cmpl    $262144, %eax
777
778//===---------------------------------------------------------------------===//
779
780define i64 @test(double %X) {
781	%Y = fptosi double %X to i64
782	ret i64 %Y
783}
784
785compiles to:
786
787_test:
788	subl	$20, %esp
789	movsd	24(%esp), %xmm0
790	movsd	%xmm0, 8(%esp)
791	fldl	8(%esp)
792	fisttpll	(%esp)
793	movl	4(%esp), %edx
794	movl	(%esp), %eax
795	addl	$20, %esp
796	#FP_REG_KILL
797	ret
798
799This should just fldl directly from the input stack slot.
800
801//===---------------------------------------------------------------------===//
802
803This code:
804int foo (int x) { return (x & 65535) | 255; }
805
806Should compile into:
807
808_foo:
809        movzwl  4(%esp), %eax
810        orl     $255, %eax
811        ret
812
813instead of:
814_foo:
815	movl	$65280, %eax
816	andl	4(%esp), %eax
817	orl	$255, %eax
818	ret
819
820//===---------------------------------------------------------------------===//
821
822We're codegen'ing multiply of long longs inefficiently:
823
824unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
825  return arg1 *  arg2;
826}
827
We compile to (-fomit-frame-pointer):
829
830_LLM:
831	pushl	%esi
832	movl	8(%esp), %ecx
833	movl	16(%esp), %esi
834	movl	%esi, %eax
835	mull	%ecx
836	imull	12(%esp), %esi
837	addl	%edx, %esi
838	imull	20(%esp), %ecx
839	movl	%esi, %edx
840	addl	%ecx, %edx
841	popl	%esi
842	ret
843
844This looks like a scheduling deficiency and lack of remat of the load from
845the argument area.  ICC apparently produces:
846
847        movl      8(%esp), %ecx
848        imull     12(%esp), %ecx
849        movl      16(%esp), %eax
850        imull     4(%esp), %eax
851        addl      %eax, %ecx
852        movl      4(%esp), %eax
853        mull      12(%esp)
854        addl      %ecx, %edx
855        ret
856
857Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
858http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
859
860//===---------------------------------------------------------------------===//
861
862We can fold a store into "zeroing a reg".  Instead of:
863
864xorl    %eax, %eax
865movl    %eax, 124(%esp)
866
867we should get:
868
869movl    $0, 124(%esp)
870
871if the flags of the xor are dead.
872
873Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
874be folded into: shl [mem], 1
875
876//===---------------------------------------------------------------------===//
877
878In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
879or and instruction, for example:
880
881	xorpd	LCPI1_0, %xmm2
882
883However, if xmm2 gets spilled, we end up with really ugly code like this:
884
885	movsd	(%esp), %xmm0
886	xorpd	LCPI1_0, %xmm0
887	movsd	%xmm0, (%esp)
888
889Since we 'know' that this is a 'neg', we can actually "fold" the spill into
890the neg/abs instruction, turning it into an *integer* operation, like this:
891
892	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
893
You could also use xorb, but xorl is less likely to lead to a partial register
895stall.  Here is a contrived testcase:
896
897double a, b, c;
898void test(double *P) {
899  double X = *P;
900  a = X;
901  bar();
902  X = -X;
903  b = X;
904  bar();
905  c = X;
906}
907
908//===---------------------------------------------------------------------===//
909
The generated code on x86 for checking for signed overflow of a multiply in
the obvious way is much longer than it needs to be.
912
913int x(int a, int b) {
914  long long prod = (long long)a*b;
915  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
916}
917
918See PR2053 for more details.
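
A hedged sketch of the shorter form, written with a newer GCC/Clang builtin
(an illustration of the target sequence, not what the front-end emitted):
__builtin_smul_overflow expresses exactly this check, and it can lower to an
imull followed by a seto of the overflow flag.

int x_short(int a, int b) {
  int prod;
  /* nonzero iff a*b does not fit in 32 signed bits, i.e. the same
     condition as the 64-bit comparison above */
  return __builtin_smul_overflow(a, b, &prod);
}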
919
920//===---------------------------------------------------------------------===//
921
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter.  The downside is that it puts more
pressure on register allocation because it has fixed operands.
926
927Example:
928int abs(int x) {return x < 0 ? -x : x;}
929
gcc compiles this to the following when using -march/-mtune=pentium2/3/4/m/etc.:
931abs:
932        movl    4(%esp), %eax
933        cltd
934        xorl    %edx, %eax
935        subl    %edx, %eax
936        ret
937
938//===---------------------------------------------------------------------===//
939
940Take the following code (from
941http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
942
943extern unsigned char first_one[65536];
944int FirstOnet(unsigned long long arg1)
945{
946  if (arg1 >> 48)
947    return (first_one[arg1 >> 48]);
948  return 0;
949}
950
951
952The following code is currently generated:
953FirstOnet:
954        movl    8(%esp), %eax
955        cmpl    $65536, %eax
956        movl    4(%esp), %ecx
957        jb      .LBB1_2 # UnifiedReturnBlock
958.LBB1_1:        # ifthen
959        shrl    $16, %eax
960        movzbl  first_one(%eax), %eax
961        ret
962.LBB1_2:        # UnifiedReturnBlock
963        xorl    %eax, %eax
964        ret
965
966We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
967lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
968
969//===---------------------------------------------------------------------===//
970
971We compile this function:
972
973define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
974entry:
975	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
976	br i1 %tmp2, label %bb7, label %bb
977
978bb:		; preds = %entry
979	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
980	ret i32 %tmp6
981
982bb7:		; preds = %entry
983	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
984	ret i32 %tmp10
985}
986
987to:
988
989foo:                                    # @foo
990# %bb.0:                                # %entry
991	movl	4(%esp), %ecx
992	cmpb	$0, 16(%esp)
993	je	.LBB0_2
994# %bb.1:                                # %bb
995	movl	8(%esp), %eax
996	addl	%ecx, %eax
997	ret
998.LBB0_2:                                # %bb7
999	movl	12(%esp), %edx
1000	movl	%ecx, %eax
1001	subl	%edx, %eax
1002	ret
1003
1004There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1005couple more movls by putting 4(%esp) into %eax instead of %ecx.
1006
1007//===---------------------------------------------------------------------===//
1008
1009Take the following:
1010
1011target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
1012target triple = "i386-apple-darwin8"
1013@in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
1014define fastcc void @abort_gzip() noreturn nounwind  {
1015entry:
1016	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
1017	br i1 %tmp.b.i, label %bb.i, label %bb4.i
1018bb.i:		; preds = %entry
1019	tail call void @exit( i32 1 ) noreturn nounwind
1020	unreachable
1021bb4.i:		; preds = %entry
1022	store i1 true, i1* @in_exit.4870.b
1023	tail call void @exit( i32 1 ) noreturn nounwind
1024	unreachable
1025}
1026declare void @exit(i32) noreturn nounwind
1027
1028This compiles into:
1029_abort_gzip:                            ## @abort_gzip
1030## %bb.0:                               ## %entry
1031	subl	$12, %esp
1032	movb	_in_exit.4870.b, %al
1033	cmpb	$1, %al
1034	jne	LBB0_2
1035
1036We somehow miss folding the movb into the cmpb.
1037
1038//===---------------------------------------------------------------------===//
1039
1040We compile:
1041
1042int test(int x, int y) {
1043  return x-y-1;
1044}
1045
1046into (-m64):
1047
1048_test:
1049	decl	%edi
1050	movl	%edi, %eax
1051	subl	%esi, %eax
1052	ret
1053
It would be better to codegen this as x+~y (notl+addl), since x-y-1 == x+~y in
two's complement.
1055
1056//===---------------------------------------------------------------------===//
1057
1058This code:
1059
1060int foo(const char *str,...)
1061{
1062 __builtin_va_list a; int x;
1063 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1064 return x;
1065}
1066
1067gets compiled into this on x86-64:
1068	subq    $200, %rsp
1069        movaps  %xmm7, 160(%rsp)
1070        movaps  %xmm6, 144(%rsp)
1071        movaps  %xmm5, 128(%rsp)
1072        movaps  %xmm4, 112(%rsp)
1073        movaps  %xmm3, 96(%rsp)
1074        movaps  %xmm2, 80(%rsp)
1075        movaps  %xmm1, 64(%rsp)
1076        movaps  %xmm0, 48(%rsp)
1077        movq    %r9, 40(%rsp)
1078        movq    %r8, 32(%rsp)
1079        movq    %rcx, 24(%rsp)
1080        movq    %rdx, 16(%rsp)
1081        movq    %rsi, 8(%rsp)
1082        leaq    (%rsp), %rax
1083        movq    %rax, 192(%rsp)
1084        leaq    208(%rsp), %rax
1085        movq    %rax, 184(%rsp)
1086        movl    $48, 180(%rsp)
1087        movl    $8, 176(%rsp)
1088        movl    176(%rsp), %eax
1089        cmpl    $47, %eax
1090        jbe     .LBB1_3 # bb
1091.LBB1_1:        # bb3
1092        movq    184(%rsp), %rcx
1093        leaq    8(%rcx), %rax
1094        movq    %rax, 184(%rsp)
1095.LBB1_2:        # bb4
1096        movl    (%rcx), %eax
1097        addq    $200, %rsp
1098        ret
1099.LBB1_3:        # bb
1100        movl    %eax, %ecx
1101        addl    $8, %eax
1102        addq    192(%rsp), %rcx
1103        movl    %eax, 176(%rsp)
1104        jmp     .LBB1_2 # bb4
1105
1106gcc 4.3 generates:
1107	subq    $96, %rsp
1108.LCFI0:
1109        leaq    104(%rsp), %rax
1110        movq    %rsi, -80(%rsp)
1111        movl    $8, -120(%rsp)
1112        movq    %rax, -112(%rsp)
1113        leaq    -88(%rsp), %rax
1114        movq    %rax, -104(%rsp)
1115        movl    $8, %eax
1116        cmpl    $48, %eax
1117        jb      .L6
1118        movq    -112(%rsp), %rdx
1119        movl    (%rdx), %eax
1120        addq    $96, %rsp
1121        ret
1122        .p2align 4,,10
1123        .p2align 3
1124.L6:
1125        mov     %eax, %edx
1126        addq    -104(%rsp), %rdx
1127        addl    $8, %eax
1128        movl    %eax, -120(%rsp)
1129        movl    (%rdx), %eax
1130        addq    $96, %rsp
1131        ret
1132
1133and it gets compiled into this on x86:
1134	pushl   %ebp
1135        movl    %esp, %ebp
1136        subl    $4, %esp
1137        leal    12(%ebp), %eax
1138        movl    %eax, -4(%ebp)
1139        leal    16(%ebp), %eax
1140        movl    %eax, -4(%ebp)
1141        movl    12(%ebp), %eax
1142        addl    $4, %esp
1143        popl    %ebp
1144        ret
1145
1146gcc 4.3 generates:
1147	pushl   %ebp
1148        movl    %esp, %ebp
1149        movl    12(%ebp), %eax
1150        popl    %ebp
1151        ret
1152
1153//===---------------------------------------------------------------------===//
1154
1155Teach tblgen not to check bitconvert source type in some cases. This allows us
1156to consolidate the following patterns in X86InstrMMX.td:
1157
1158def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1159                                                  (iPTR 0))))),
1160          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1161def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1162                                                  (iPTR 0))))),
1163          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1164def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1165                                                  (iPTR 0))))),
1166          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1167
1168There are other cases in various td files.
1169
1170//===---------------------------------------------------------------------===//
1171
1172Take something like the following on x86-32:
1173unsigned a(unsigned long long x, unsigned y) {return x % y;}
1174
1175We currently generate a libcall, but we really shouldn't: the expansion is
1176shorter and likely faster than the libcall.  The expected code is something
1177like the following:
1178
1179	movl	12(%ebp), %eax
1180	movl	16(%ebp), %ecx
1181	xorl	%edx, %edx
1182	divl	%ecx
1183	movl	8(%ebp), %eax
1184	divl	%ecx
1185	movl	%edx, %eax
1186	ret
1187
1188A similar code sequence works for division.
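
For reference, a C sketch of why two divl's suffice (y assumed nonzero; the
64-bit % below is only there to show the algebra that the two-divide sequence
implements):

unsigned urem64_32(unsigned long long x, unsigned y) {
  unsigned hi = (unsigned)(x >> 32), lo = (unsigned)x;
  /* x mod y == ((hi mod y) * 2^32 + lo) mod y, and since hi % y < y the
     second hardware divide's quotient fits in 32 bits (no #DE overflow). */
  unsigned long long folded = ((unsigned long long)(hi % y) << 32) | lo;
  return (unsigned)(folded % y);
}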
1189
1190//===---------------------------------------------------------------------===//
1191
1192We currently compile this:
1193
1194define i32 @func1(i32 %v1, i32 %v2) nounwind {
1195entry:
1196  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1197  %sum = extractvalue {i32, i1} %t, 0
1198  %obit = extractvalue {i32, i1} %t, 1
1199  br i1 %obit, label %overflow, label %normal
1200normal:
1201  ret i32 %sum
1202overflow:
1203  call void @llvm.trap()
1204  unreachable
1205}
1206declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1207declare void @llvm.trap()
1208
1209to:
1210
1211_func1:
1212	movl	4(%esp), %eax
1213	addl	8(%esp), %eax
1214	jo	LBB1_2	## overflow
1215LBB1_1:	## normal
1216	ret
1217LBB1_2:	## overflow
1218	ud2
1219
1220it would be nice to produce "into" someday.
1221
1222//===---------------------------------------------------------------------===//
1223
Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions. This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions, nor when the OF or CF flags are needed.
1229
1230The shift operators have the complication that when the shift count is
1231zero, EFLAGS is not set, so they can only subsume a test instruction if
1232the shift count is known to be non-zero. Also, using the EFLAGS value
1233from a shift is apparently very slow on some x86 implementations.
1234
1235In read-modify-write instructions, the root node in the isel match is
1236the store, and isel has no way for the use of the EFLAGS result of the
1237arithmetic to be remapped to the new node.
1238
Add and subtract instructions set OF on signed overflow and CF on unsigned
1240overflow, while test instructions always clear OF and CF. In order to
1241replace a test with an add or subtract in a situation where OF or CF is
1242needed, codegen must be able to prove that the operation cannot see
1243signed or unsigned overflow, respectively.
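
A small C example of the kind of test that could be subsumed (assuming, per
the above, that OF/CF are not needed for this comparison; the function name
is made up):

int and_nonzero(int a, int b) {
  int m = a & b;
  if (m == 0)   /* the ZF produced by the andl already answers this;
                   the separate testl is redundant */
    return -1;
  return m;
}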
1244
1245//===---------------------------------------------------------------------===//
1246
1247memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1248define <16 x float> @foo(<16 x float> %A) nounwind {
1249	%tmp = alloca <16 x float>, align 16
1250	%tmp2 = alloca <16 x float>, align 16
1251	store <16 x float> %A, <16 x float>* %tmp
1252	%s = bitcast <16 x float>* %tmp to i8*
1253	%s2 = bitcast <16 x float>* %tmp2 to i8*
1254	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1255	%R = load <16 x float>* %tmp2
1256	ret <16 x float> %R
1257}
1258
1259declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1260
1261which compiles to:
1262
1263_foo:
1264	subl	$140, %esp
1265	movaps	%xmm3, 112(%esp)
1266	movaps	%xmm2, 96(%esp)
1267	movaps	%xmm1, 80(%esp)
1268	movaps	%xmm0, 64(%esp)
1269	movl	60(%esp), %eax
1270	movl	%eax, 124(%esp)
1271	movl	56(%esp), %eax
1272	movl	%eax, 120(%esp)
1273	movl	52(%esp), %eax
1274        <many many more 32-bit copies>
1275      	movaps	(%esp), %xmm0
1276	movaps	16(%esp), %xmm1
1277	movaps	32(%esp), %xmm2
1278	movaps	48(%esp), %xmm3
1279	addl	$140, %esp
1280	ret
1281
1282On Nehalem, it may even be cheaper to just use movups when unaligned than to
1283fall back to lower-granularity chunks.
1284
1285//===---------------------------------------------------------------------===//
1286
1287Implement processor-specific optimizations for parity with GCC on these
1288processors.  GCC does two optimizations:
1289
1. ix86_pad_returns inserts a noop before a ret instruction if it is
   immediately preceded by a conditional branch or is the target of a jump.
12922. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1293   code contains more than 3 branches.
1294
The first one is done for all AMDs, Core2, and "Generic".
The second one is done for Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".
1298
1299//===---------------------------------------------------------------------===//

Testcase:
1301int x(int a) { return (a&0xf0)>>4; }
1302
1303Current output:
1304	movl	4(%esp), %eax
1305	shrl	$4, %eax
1306	andl	$15, %eax
1307	ret
1308
1309Ideal output:
1310	movzbl	4(%esp), %eax
1311	shrl	$4, %eax
1312	ret
1313
1314//===---------------------------------------------------------------------===//
1315
1316Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1317properly.
1318
When the return value is not used (i.e. we only care about the value in
memory), x86 does not need to produce the updated value in a register.
Instead, it can use add, sub, inc, and dec instructions with the "lock" prefix.
1322
This is currently implemented using a bit of an instruction selection trick.
The issue is that the target-independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain. The current
trick is to select it into a MERGE_VALUES with the first definition being an
implicit_def. The proper solution is to add new ISD opcodes for the no-output
variant. The DAG combiner can then transform the node before it gets to target
node selection.
1329
1330Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1331fact these instructions are identical to the non-lock versions. We need a way to
1332add target specific information to target nodes and have this information
1333carried over to machine instructions. Asm printer (or JIT) can use this
1334information to add the "lock" prefix.
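
A sketch of the no-result case using the __sync builtin from the text:

void inc_counter(int *p) {
  /* The return value is ignored, so a single "lock add" style instruction
     is enough; no updated value needs to be produced in a register. */
  __sync_add_and_fetch(p, 1);
}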
1335
1336//===---------------------------------------------------------------------===//
1337
1338struct B {
1339  unsigned char y0 : 1;
1340};
1341
1342int bar(struct B* a) { return a->y0; }
1343
1344define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1345  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1346  %2 = load i8* %1, align 1
1347  %3 = and i8 %2, 1
1348  %4 = zext i8 %3 to i32
1349  ret i32 %4
1350}
1351
1352bar:                                    # @bar
1353# %bb.0:
1354        movb    (%rdi), %al
1355        andb    $1, %al
1356        movzbl  %al, %eax
1357        ret
1358
1359Missed optimization: should be movl+andl.
1360
1361//===---------------------------------------------------------------------===//
1362
The x86_64 ABI says:
1364
1365Booleans, when stored in a memory object, are stored as single byte objects the
1366value of which is always 0 (false) or 1 (true).
1367
1368We are not using this fact:
1369
1370int bar(_Bool *a) { return *a; }
1371
1372define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1373  %1 = load i8* %a, align 1, !tbaa !0
1374  %tmp = and i8 %1, 1
1375  %2 = zext i8 %tmp to i32
1376  ret i32 %2
1377}
1378
1379bar:
1380        movb    (%rdi), %al
1381        andb    $1, %al
1382        movzbl  %al, %eax
1383        ret
1384
1385GCC produces
1386
1387bar:
1388        movzbl  (%rdi), %eax
1389        ret
1390
1391//===---------------------------------------------------------------------===//
1392
1393Take the following C code:
1394int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1395
1396We generate the following IR with clang:
1397define i32 @f(i32 %a, i32 %b) nounwind readnone {
1398entry:
1399  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1400  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1401  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1402  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1403  ret i32 %conv5
1404}
1405
1406And the following x86 code:
1407	xorl	%esi, %edi
1408	testb	$-1, %dil
1409	sete	%al
1410	movzbl	%al, %eax
1411	ret
1412
1413A cmpb instead of the xorl+testb would be one instruction shorter.
1414
1415//===---------------------------------------------------------------------===//
1416
1417Given the following C code:
1418int f(int a, int b) { return (signed char)a == (signed char)b; }
1419
1420We generate the following IR with clang:
1421define i32 @f(i32 %a, i32 %b) nounwind readnone {
1422entry:
1423  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1424  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1425  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1426  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1427  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1428  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1429  ret i32 %conv5
1430}
1431
1432And the following x86 code:
1433	movsbl	%sil, %eax
1434	movsbl	%dil, %ecx
1435	cmpl	%eax, %ecx
1436	sete	%al
1437	movzbl	%al, %eax
1438	ret
1439
1440
1441It should be possible to eliminate the sign extensions.
1442
1443//===---------------------------------------------------------------------===//
1444
1445LLVM misses a load+store narrowing opportunity in this code:
1446
1447%struct.bf = type { i64, i16, i16, i32 }
1448
1449@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1450
1451define void @t1() nounwind ssp {
1452entry:
1453  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1454  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1455  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1456  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1457  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1458  store i32 %4, i32* %2, align 1
1459  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1460  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1461  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1462  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1463  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1464  store i32 %9, i32* %7, align 1
1465  ret void
1466}
1467
1468LLVM currently emits this:
1469
1470  movq  bfi(%rip), %rax
1471  andl  $-65537, 8(%rax)
1472  movq  bfi(%rip), %rax
1473  andl  $-131073, 8(%rax)
1474  ret
1475
1476It could narrow the loads and stores to emit this:
1477
1478  movq  bfi(%rip), %rax
1479  andb  $-2, 10(%rax)
1480  movq  bfi(%rip), %rax
1481  andb  $-3, 10(%rax)
1482  ret
1483
1484The trouble is that there is a TokenFactor between the store and the
1485load, making it non-trivial to determine if there's anything between
1486the load and the store which would prohibit narrowing.
1487
1488//===---------------------------------------------------------------------===//
1489
1490This code:
1491void foo(unsigned x) {
1492  if (x == 0) bar();
1493  else if (x == 1) qux();
1494}
1495
1496currently compiles into:
1497_foo:
1498	movl	4(%esp), %eax
1499	cmpl	$1, %eax
1500	je	LBB0_3
1501	testl	%eax, %eax
1502	jne	LBB0_4
1503
1504the testl could be removed:
1505_foo:
1506	movl	4(%esp), %eax
1507	cmpl	$1, %eax
1508	je	LBB0_3
1509	jb	LBB0_4
1510
15110 is the only unsigned number < 1.
1512
1513//===---------------------------------------------------------------------===//
1514
1515This code:
1516
1517%0 = type { i32, i1 }
1518
1519define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1520entry:
1521  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1522  %cmp = extractvalue %0 %uadd, 1
1523  %inc = zext i1 %cmp to i32
1524  %add = add i32 %x, %sum
1525  %z.0 = add i32 %add, %inc
1526  ret i32 %z.0
1527}
1528
1529declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1530
1531compiles to:
1532
1533_add32carry:                            ## @add32carry
1534	addl	%esi, %edi
1535	sbbl	%ecx, %ecx
1536	movl	%edi, %eax
1537	subl	%ecx, %eax
1538	ret
1539
1540But it could be:
1541
1542_add32carry:
1543	leal	(%rsi,%rdi), %eax
1544	cmpl	%esi, %eax
1545	adcl	$0, %eax
1546	ret
1547
1548//===---------------------------------------------------------------------===//
1549
1550The hot loop of 256.bzip2 contains code that looks a bit like this:
1551
1552int foo(char *P, char *Q, int x, int y) {
1553  if (P[0] != Q[0])
1554     return P[0] < Q[0];
1555  if (P[1] != Q[1])
1556     return P[1] < Q[1];
1557  if (P[2] != Q[2])
1558     return P[2] < Q[2];
1559   return P[3] < Q[3];
1560}
1561
1562In the real code, we get a lot more wrong than this.  However, even in this
1563code we generate:
1564
1565_foo:                                   ## @foo
1566## %bb.0:                               ## %entry
1567	movb	(%rsi), %al
1568	movb	(%rdi), %cl
1569	cmpb	%al, %cl
1570	je	LBB0_2
1571LBB0_1:                                 ## %if.then
1572	cmpb	%al, %cl
1573	jmp	LBB0_5
1574LBB0_2:                                 ## %if.end
1575	movb	1(%rsi), %al
1576	movb	1(%rdi), %cl
1577	cmpb	%al, %cl
1578	jne	LBB0_1
1579## %bb.3:                               ## %if.end38
1580	movb	2(%rsi), %al
1581	movb	2(%rdi), %cl
1582	cmpb	%al, %cl
1583	jne	LBB0_1
1584## %bb.4:                               ## %if.end60
1585	movb	3(%rdi), %al
1586	cmpb	3(%rsi), %al
1587LBB0_5:                                 ## %if.end60
1588	setl	%al
1589	movzbl	%al, %eax
1590	ret
1591
1592Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1593redundant compare also forces the register values to be live, which prevents
1594folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1595
1596_foo:
1597	movzbl	(%rsi), %eax
1598	cmpb	%al, (%rdi)
1599	jne	L10
1600L12:
1601	movzbl	1(%rsi), %eax
1602	cmpb	%al, 1(%rdi)
1603	jne	L10
1604	movzbl	2(%rsi), %eax
1605	cmpb	%al, 2(%rdi)
1606	jne	L10
1607	movzbl	3(%rdi), %eax
1608	cmpb	3(%rsi), %al
1609L10:
1610	setl	%al
1611	movzbl	%al, %eax
1612	ret
1613
1614which is "perfect".
1615
1616//===---------------------------------------------------------------------===//
1617
1618For the branch in the following code:
1619int a();
1620int b(int x, int y) {
1621  if (x & (1<<(y&7)))
1622    return a();
1623  return y;
1624}
1625
1626We currently generate:
1627	movb	%sil, %al
1628	andb	$7, %al
1629	movzbl	%al, %eax
1630	btl	%eax, %edi
1631	jae	.LBB0_2
1632
1633movl+andl would be shorter than the movb+andb+movzbl sequence.
1634
1635//===---------------------------------------------------------------------===//
1636
1637For the following:
1638struct u1 {
1639    float x, y;
1640};
1641float foo(struct u1 u) {
1642    return u.x + u.y;
1643}
1644
1645We currently generate:
1646	movdqa	%xmm0, %xmm1
1647	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
1648	addss	%xmm1, %xmm0
1649	ret
1650
1651We could save an instruction here by commuting the addss.
1652
1653//===---------------------------------------------------------------------===//
1654
1655This (from PR9661):
1656
1657float clamp_float(float a) {
1658        if (a > 1.0f)
1659                return 1.0f;
1660        else if (a < 0.0f)
1661                return 0.0f;
1662        else
1663                return a;
1664}
1665
1666Could compile to:
1667
1668clamp_float:                            # @clamp_float
1669        movss   .LCPI0_0(%rip), %xmm1
1670        minss   %xmm1, %xmm0
1671        pxor    %xmm1, %xmm1
1672        maxss   %xmm1, %xmm0
1673        ret
1674
1675with -ffast-math.
1676
1677//===---------------------------------------------------------------------===//
1678
1679This function (from PR9803):
1680
1681int clamp2(int a) {
1682        if (a > 5)
1683                a = 5;
1684        if (a < 0)
1685                return 0;
1686        return a;
1687}
1688
1689Compiles to:
1690
1691_clamp2:                                ## @clamp2
1692        pushq   %rbp
1693        movq    %rsp, %rbp
1694        cmpl    $5, %edi
1695        movl    $5, %ecx
1696        cmovlel %edi, %ecx
1697        testl   %ecx, %ecx
1698        movl    $0, %eax
1699        cmovnsl %ecx, %eax
1700        popq    %rbp
1701        ret
1702
The move of 0 could be scheduled above the test so that it becomes an xor reg,reg.
1704
1705//===---------------------------------------------------------------------===//
1706
1707GCC PR48986.  We currently compile this:
1708
1709void bar(void);
1710void yyy(int* p) {
1711    if (__sync_fetch_and_add(p, -1) == 1)
1712      bar();
1713}
1714
1715into:
1716	movl	$-1, %eax
1717	lock
1718	xaddl	%eax, (%rdi)
1719	cmpl	$1, %eax
1720	je	LBB0_2
1721
1722Instead we could generate:
1723
1724	lock
	decl	(%rdi)
1726	je LBB0_2
1727
1728The trick is to match "fetch_and_add(X, -C) == C".
1729
1730//===---------------------------------------------------------------------===//
1731
1732unsigned t(unsigned a, unsigned b) {
1733  return a <= b ? 5 : -5;
1734}
1735
1736We generate:
1737	movl	$5, %ecx
1738	cmpl	%esi, %edi
1739	movl	$-5, %eax
1740	cmovbel	%ecx, %eax
1741
GCC instead materializes a 0/-1 mask from the borrow (sbb) and folds the
select into an and/add:
1743	cmpl	%edi, %esi
1744	sbbl	%eax, %eax
1745	andl	$-10, %eax
1746	addl	$5, %eax
1747
1748//===---------------------------------------------------------------------===//
1749