• Home
  • History
  • Annotate
  • only in this directory
NameDateSize

..24-Oct-201431

AsmParser/H24-Oct-20147

CMakeLists.txtH A D24-Oct-20142 KiB

Disassembler/H24-Oct-201410

InstPrinter/H24-Oct-201411

LLVMBuild.txtH A D24-Oct-20141 KiB

MakefileH A D24-Oct-2014861

MCTargetDesc/H24-Oct-201416

README-FPStack.txtH A D24-Oct-20142.7 KiB

README-MMX.txtH A D24-Oct-20141.5 KiB

README-SSE.txtH A D24-Oct-201427 KiB

README-UNIMPLEMENTED.txtH A D24-Oct-2014679

README-X86-64.txtH A D24-Oct-20146 KiB

README.txtH A D24-Oct-201453.6 KiB

TargetInfo/H24-Oct-20146

Utils/H24-Oct-20147

X86.hH A D24-Oct-20142.7 KiB

X86.tdH A D24-Oct-201415.1 KiB

X86AsmPrinter.cppH A D24-Oct-201428.3 KiB

X86AsmPrinter.hH A D24-Oct-20143 KiB

X86CallingConv.tdH A D24-Oct-201418.5 KiB

X86CodeEmitter.cppH A D24-Oct-201451.1 KiB

X86COFFMachineModuleInfo.cppH A D24-Oct-2014614

X86COFFMachineModuleInfo.hH A D24-Oct-20141.4 KiB

X86CompilationCallback_Win64.asmH A D24-Oct-20141.6 KiB

X86ELFWriterInfo.cppH A D24-Oct-20144.1 KiB

X86ELFWriterInfo.hH A D24-Oct-20142.2 KiB

X86FastISel.cppH A D24-Oct-201474.7 KiB

X86FloatingPoint.cppH A D24-Oct-201465.7 KiB

X86FrameLowering.cppH A D24-Oct-201457.2 KiB

X86FrameLowering.hH A D24-Oct-20142.5 KiB

X86Instr3DNow.tdH A D24-Oct-20144.3 KiB

X86InstrArithmetic.tdH A D24-Oct-201459 KiB

X86InstrBuilder.hH A D24-Oct-20146.6 KiB

X86InstrCMovSetCC.tdH A D24-Oct-20145.1 KiB

X86InstrCompiler.tdH A D24-Oct-201480.4 KiB

X86InstrControl.tdH A D24-Oct-201412.3 KiB

X86InstrExtension.tdH A D24-Oct-20148.7 KiB

X86InstrFMA.tdH A D24-Oct-201418 KiB

X86InstrFormats.tdH A D24-Oct-201428.1 KiB

X86InstrFPStack.tdH A D24-Oct-201433.9 KiB

X86InstrFragmentsSIMD.tdH A D24-Oct-201418.9 KiB

X86InstrInfo.cppH A D24-Oct-2014203.2 KiB

X86InstrInfo.hH A D24-Oct-201419.2 KiB

X86InstrInfo.tdH A D24-Oct-201495 KiB

X86InstrMMX.tdH A D24-Oct-201427.6 KiB

X86InstrShiftRotate.tdH A D24-Oct-201444.7 KiB

X86InstrSSE.tdH A D24-Oct-2014392.7 KiB

X86InstrSVM.tdH A D24-Oct-20142.1 KiB

X86InstrSystem.tdH A D24-Oct-201424.2 KiB

X86InstrVMX.tdH A D24-Oct-20143.2 KiB

X86InstrXOP.tdH A D24-Oct-201414.9 KiB

X86ISelDAGToDAG.cppH A D24-Oct-2014102.6 KiB

X86ISelLowering.cppH A D24-Oct-2014642.7 KiB

X86ISelLowering.hH A D24-Oct-201437.4 KiB

X86JITInfo.cppH A D24-Oct-201419.3 KiB

X86JITInfo.hH A D24-Oct-20143 KiB

X86MachineFunctionInfo.cppH A D24-Oct-2014444

X86MachineFunctionInfo.hH A D24-Oct-20145.6 KiB

X86MCInstLower.cppH A D24-Oct-201429.1 KiB

X86MCInstLower.hH A D24-Oct-20141.3 KiB

X86RegisterInfo.cppH A D24-Oct-201429.4 KiB

X86RegisterInfo.hH A D24-Oct-20145.2 KiB

X86RegisterInfo.tdH A D24-Oct-201417.8 KiB

X86Relocations.hH A D24-Oct-20142 KiB

X86Schedule.tdH A D24-Oct-201415.6 KiB

X86ScheduleAtom.tdH A D24-Oct-201427.7 KiB

X86SelectionDAGInfo.cppH A D24-Oct-20149.9 KiB

X86SelectionDAGInfo.hH A D24-Oct-20141.9 KiB

X86Subtarget.cppH A D24-Oct-201412.9 KiB

X86Subtarget.hH A D24-Oct-201411.2 KiB

X86TargetMachine.cppH A D24-Oct-20147.1 KiB

X86TargetMachine.hH A D24-Oct-20144.5 KiB

X86TargetObjectFile.cppH A D24-Oct-20141.9 KiB

X86TargetObjectFile.hH A D24-Oct-20141.5 KiB

X86VZeroUpper.cppH A D24-Oct-20149.3 KiB

README-FPStack.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend: FP stack related stuff
3//===---------------------------------------------------------------------===//
4
5//===---------------------------------------------------------------------===//
6
7Some targets (e.g. athlons) prefer freep to fstp ST(0):
8http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
9
10//===---------------------------------------------------------------------===//
11
12This should use fiadd on chips where it is profitable:
13double foo(double P, int *I) { return P+*I; }
14
15We have fiadd patterns now but the following have the same cost and
16complexity. We need a way to specify the latter is more profitable.
17
18def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
19                    [(set RFP:$dst, (fadd RFP:$src1,
20                                     (extloadf64f32 addr:$src2)))]>;
21                // ST(0) = ST(0) + [mem32]
22
23def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
24                    [(set RFP:$dst, (fadd RFP:$src1,
25                                     (X86fild addr:$src2, i32)))]>;
26                // ST(0) = ST(0) + [mem32int]
27
28//===---------------------------------------------------------------------===//
29
30The FP stackifier should handle simple permutations to reduce the number of shuffle
31instructions, e.g. turning:
32
33fld P	->		fld Q
34fld Q			fld P
35fxch
36
37or:
38
39fxch	->		fucomi
40fucomi			jl X
41jg X
42
43Ideas:
44http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
45
46
47//===---------------------------------------------------------------------===//
48
49Add a target specific hook to DAG combiner to handle SINT_TO_FP and
50FP_TO_SINT when the source operand is already in memory.
51
52//===---------------------------------------------------------------------===//
53
54Open code rint,floor,ceil,trunc:
55http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
56http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
57
58Opencode the sincos[f] libcall.
59
60//===---------------------------------------------------------------------===//
61
62None of the FPStack instructions are handled in
63X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
64folding spill code into the instructions.
65
66//===---------------------------------------------------------------------===//
67
68Currently the x86 codegen isn't very good at mixing SSE and FPStack
69code:
70
71unsigned int foo(double x) { return x; }
72
73foo:
74	subl $20, %esp
75	movsd 24(%esp), %xmm0
76	movsd %xmm0, 8(%esp)
77	fldl 8(%esp)
78	fisttpll (%esp)
79	movl (%esp), %eax
80	addl $20, %esp
81	ret
82
83This just requires being smarter when custom expanding fptoui.
84
85//===---------------------------------------------------------------------===//
86

README-MMX.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend: MMX-specific stuff.
3//===---------------------------------------------------------------------===//
4
5//===---------------------------------------------------------------------===//
6
7This:
8
9#include <mmintrin.h>
10
11__v2si qux(int A) {
12  return (__v2si){ 0, A };
13}
14
15is compiled into:
16
17_qux:
18        subl $28, %esp
19        movl 32(%esp), %eax
20        movd %eax, %mm0
21        movq %mm0, (%esp)
22        movl (%esp), %eax
23        movl %eax, 20(%esp)
24        movq %mm0, 8(%esp)
25        movl 12(%esp), %eax
26        movl %eax, 16(%esp)
27        movq 16(%esp), %mm0
28        addl $28, %esp
29        ret
30
31Yuck!
32
33GCC gives us:
34
35_qux:
36        subl    $12, %esp
37        movl    16(%esp), %eax
38        movl    20(%esp), %edx
39        movl    $0, (%eax)
40        movl    %edx, 4(%eax)
41        addl    $12, %esp
42        ret     $4
43
44//===---------------------------------------------------------------------===//
45
46We generate crappy code for this:
47
48__m64 t() {
49  return _mm_cvtsi32_si64(1);
50}
51
52_t:
53	subl	$12, %esp
54	movl	$1, %eax
55	movd	%eax, %mm0
56	movq	%mm0, (%esp)
57	movl	(%esp), %eax
58	movl	4(%esp), %edx
59	addl	$12, %esp
60	ret
61
62The extra stack traffic is covered in the previous entry. But the other reason
63is we are not smart about materializing constants in MMX registers. With -m64
64
65	movl	$1, %eax
66	movd	%eax, %mm0
67	movd	%mm0, %rax
68	ret
69
70We should be using a constantpool load instead:
71	movq	LC0(%rip), %rax
72

README-SSE.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend: SSE-specific stuff.
3//===---------------------------------------------------------------------===//
4
5//===---------------------------------------------------------------------===//
6
7SSE Variable shift can be custom lowered to something like this, which uses a
8small table + unaligned load + shuffle instead of going through memory.
9
10__m128i_shift_right:
11	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
12	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13
14...
15__m128i shift_right(__m128i value, unsigned long offset) {
16  return _mm_shuffle_epi8(value,
17               _mm_loadu_si128((__m128i *) (___m128i_shift_right + offset)));
18}
19
20//===---------------------------------------------------------------------===//
21
22SSE has instructions for doing operations on complex numbers, we should pattern
23match them.   For example, this should turn into a horizontal add:
24
25typedef float __attribute__((vector_size(16))) v4f32;
26float f32(v4f32 A) {
27  return A[0]+A[1]+A[2]+A[3];
28}
29
30Instead we get this:
31
32_f32:                                   ## @f32
33	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
34	addss	%xmm0, %xmm1
35	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
36	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
37	movaps	%xmm0, %xmm3
38	addss	%xmm1, %xmm3
39	movdqa	%xmm2, %xmm0
40	addss	%xmm3, %xmm0
41	ret
42
43Also, there are cases where some simple local SLP would improve codegen a bit.
44compiling this:
45
46_Complex float f32(_Complex float A, _Complex float B) {
47  return A+B;
48}
49
50into:
51
52_f32:                                   ## @f32
53	movdqa	%xmm0, %xmm2
54	addss	%xmm1, %xmm2
55	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
56	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
57	addss	%xmm1, %xmm3
58	movaps	%xmm2, %xmm0
59	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60	ret
61
62seems silly when it could just be one addps.
63
64
65//===---------------------------------------------------------------------===//
66
67Expand libm rounding functions inline:  Significant speedups possible.
68http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69
70//===---------------------------------------------------------------------===//
71
72When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
73other fast SSE modes.
74
75//===---------------------------------------------------------------------===//
76
77Think about doing i64 math in SSE regs on x86-32.
78
79//===---------------------------------------------------------------------===//
80
81This testcase should have no SSE instructions in it, and only one load from
82a constant pool:
83
84double %test3(bool %B) {
85        %C = select bool %B, double 123.412, double 523.01123123
86        ret double %C
87}
88
89Currently, the select is being lowered, which prevents the dag combiner from
90turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
91
92The pattern isel got this one right.
93
94//===---------------------------------------------------------------------===//
95
96SSE should implement 'select_cc' using 'emulated conditional moves' that use
97pcmp/pand/pandn/por to do a selection instead of a conditional branch:
98
99double %X(double %Y, double %Z, double %A, double %B) {
100        %C = setlt double %A, %B
101        %z = fadd double %Z, 0.0    ;; select operand is not a load
102        %D = select bool %C, double %Y, double %z
103        ret double %D
104}
105
106We currently emit:
107
108_X:
109        subl $12, %esp
110        xorpd %xmm0, %xmm0
111        addsd 24(%esp), %xmm0
112        movsd 32(%esp), %xmm1
113        movsd 16(%esp), %xmm2
114        ucomisd 40(%esp), %xmm1
115        jb LBB_X_2
116LBB_X_1:
117        movsd %xmm0, %xmm2
118LBB_X_2:
119        movsd %xmm2, (%esp)
120        fldl (%esp)
121        addl $12, %esp
122        ret
123
124//===---------------------------------------------------------------------===//
125
126Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
127feasible.
128
129//===---------------------------------------------------------------------===//
130
131Codegen:
132  if (copysign(1.0, x) == copysign(1.0, y))
133into:
134  if (x^y & mask)
135when using SSE.
136
137//===---------------------------------------------------------------------===//
138
139Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
140of a v4sf value.
141
142//===---------------------------------------------------------------------===//
143
144Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
145Perhaps use pxor / xorp* to clear a XMM register first?
146
147//===---------------------------------------------------------------------===//
148
149External test Nurbs exposed some problems. Look for
150__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
151emits:
152
153        movaps    (%edx), %xmm2                                 #59.21
154        movaps    (%edx), %xmm5                                 #60.21
155        movaps    (%edx), %xmm4                                 #61.21
156        movaps    (%edx), %xmm3                                 #62.21
157        movl      40(%ecx), %ebp                                #69.49
158        shufps    $0, %xmm2, %xmm5                              #60.21
159        movl      100(%esp), %ebx                               #69.20
160        movl      (%ebx), %edi                                  #69.20
161        imull     %ebp, %edi                                    #69.49
162        addl      (%eax), %edi                                  #70.33
163        shufps    $85, %xmm2, %xmm4                             #61.21
164        shufps    $170, %xmm2, %xmm3                            #62.21
165        shufps    $255, %xmm2, %xmm2                            #63.21
166        lea       (%ebp,%ebp,2), %ebx                           #69.49
167        negl      %ebx                                          #69.49
168        lea       -3(%edi,%ebx), %ebx                           #70.33
169        shll      $4, %ebx                                      #68.37
170        addl      32(%ecx), %ebx                                #68.37
171        testb     $15, %bl                                      #91.13
172        jne       L_B1.24       # Prob 5%                       #91.13
173
174This is the llvm code after instruction scheduling:
175
176cond_next140 (0xa910740, LLVM BB @0xa90beb0):
177	%reg1078 = MOV32ri -3
178	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
179	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
180	%reg1080 = IMUL32rr %reg1079, %reg1037
181	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
182	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
183	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
184	%reg1082 = SHL32ri %reg1038, 4
185	%reg1039 = ADD32rr %reg1036, %reg1082
186	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
187	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
188	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
189	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
190	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
191	%reg1040 = MOV32rr %reg1039
192	%reg1084 = AND32ri8 %reg1039, 15
193	CMP32ri8 %reg1084, 0
194	JE mbb<cond_next204,0xa914d30>
195
196Still ok. After register allocation:
197
198cond_next140 (0xa910740, LLVM BB @0xa90beb0):
199	%EAX = MOV32ri -3
200	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
201	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
202	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
203	%EDX = MOV32rm %EDX, 1, %NOREG, 40
204	IMUL32rr %EAX<def&use>, %EDX
205	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
206	%ESI = MOV32rm %ESI, 1, %NOREG, 0
207	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
208	%EAX = LEA32r %ESI, 1, %EAX, -3
209	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
210	%ESI = MOV32rm %ESI, 1, %NOREG, 32
211	%EDI = MOV32rr %EAX
212	SHL32ri %EDI<def&use>, 4
213	ADD32rr %EDI<def&use>, %ESI
214	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
215	%XMM1 = MOVAPSrr %XMM0
216	SHUFPSrr %XMM1<def&use>, %XMM1, 170
217	%XMM2 = MOVAPSrr %XMM0
218	SHUFPSrr %XMM2<def&use>, %XMM2, 0
219	%XMM3 = MOVAPSrr %XMM0
220	SHUFPSrr %XMM3<def&use>, %XMM3, 255
221	SHUFPSrr %XMM0<def&use>, %XMM0, 85
222	%EBX = MOV32rr %EDI
223	AND32ri8 %EBX<def&use>, 15
224	CMP32ri8 %EBX, 0
225	JE mbb<cond_next204,0xa914d30>
226
227This looks really bad. The problem is shufps is a destructive opcode. Since it
228appears as operand two in more than one shufps op, it resulted in a number of
229copies. Note icc also suffers from the same problem. Either the instruction
230selector should select pshufd or the register allocator can make the two-address
231to three-address transformation.
232
233It also exposes some other problems. See MOV32ri -3 and the spills.
234
235//===---------------------------------------------------------------------===//
236
237Consider:
238
239__m128 test(float a) {
240  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
241}
242
243This compiles into:
244
245movss 4(%esp), %xmm1
246mulss %xmm1, %xmm1
247xorps %xmm0, %xmm0
248movss %xmm1, %xmm0
249ret
250
251Because mulss doesn't modify the top 3 elements, the top elements of 
252xmm1 are already zero'd.  We could compile this to:
253
254movss 4(%esp), %xmm0
255mulss %xmm0, %xmm0
256ret
257
258//===---------------------------------------------------------------------===//
259
260Here's a sick and twisted idea.  Consider code like this:
261
262__m128 test(__m128 a) {
263  float b = *(float*)&a;
264  ...
265  return _mm_set_ps(0.0, 0.0, 0.0, b);
266}
267
268This might compile to this code:
269
270movaps c(%esp), %xmm1
271xorps %xmm0, %xmm0
272movss %xmm1, %xmm0
273ret
274
275Now consider if the ... code caused xmm1 to get spilled.  This might produce
276this code:
277
278movaps c(%esp), %xmm1
279movaps %xmm1, c2(%esp)
280...
281
282xorps %xmm0, %xmm0
283movaps c2(%esp), %xmm1
284movss %xmm1, %xmm0
285ret
286
287However, since the reload is only used by these instructions, we could 
288"fold" it into the uses, producing something like this:
289
290movaps c(%esp), %xmm1
291movaps %xmm1, c2(%esp)
292...
293
294movss c2(%esp), %xmm0
295ret
296
297... saving two instructions.
298
299The basic idea is that a reload from a spill slot can, if only one 4-byte
300chunk is used, bring in 3 zeros and the one element instead of 4 elements.
301This can be used to simplify a variety of shuffle operations, where the
302elements are fixed zeros.
303
304//===---------------------------------------------------------------------===//
305
306This code generates ugly code, probably due to costs being off or something:
307
308define void @test(float* %P, <4 x float>* %P2 ) {
309        %xFloat0.688 = load float* %P
310        %tmp = load <4 x float>* %P2
311        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
312        store <4 x float> %inFloat3.713, <4 x float>* %P2
313        ret void
314}
315
316Generates:
317
318_test:
319	movl	8(%esp), %eax
320	movaps	(%eax), %xmm0
321	pxor	%xmm1, %xmm1
322	movaps	%xmm0, %xmm2
323	shufps	$50, %xmm1, %xmm2
324	shufps	$132, %xmm2, %xmm0
325	movaps	%xmm0, (%eax)
326	ret
327
328Would it be better to generate:
329
330_test:
331        movl 8(%esp), %ecx
332        movaps (%ecx), %xmm0
333	xor %eax, %eax
334        pinsrw $6, %eax, %xmm0
335        pinsrw $7, %eax, %xmm0
336        movaps %xmm0, (%ecx)
337        ret
338
339?
340
341//===---------------------------------------------------------------------===//
342
343Some useful information in the Apple Altivec / SSE Migration Guide:
344
345http://developer.apple.com/documentation/Performance/Conceptual/
346Accelerate_sse_migration/index.html
347
348e.g. SSE select using and, andnot, or. Various SSE compare translations.
349
350//===---------------------------------------------------------------------===//
351
352Add hooks to commute some CMPP operations.
353
354//===---------------------------------------------------------------------===//
355
356Apply the same transformation that merged four float into a single 128-bit load
357to loads from constant pool.
358
359//===---------------------------------------------------------------------===//
360
361Floating point max / min are commutable when -enable-unsafe-fp-path is
362specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
363nodes which are selected to max / min instructions that are marked commutable.
364
365//===---------------------------------------------------------------------===//
366
367We should materialize vector constants like "all ones" and "signbit" with 
368code like:
369
370     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
371
372and:
373     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
374     psrlq   xmm1, 31     ; xmm1 = all 100000000000...
375
376instead of using a load from the constant pool.  The latter is important for
377ABS/NEG/copysign etc.
378
379//===---------------------------------------------------------------------===//
380
381These functions:
382
383#include <xmmintrin.h>
384__m128i a;
385void x(unsigned short n) {
386  a = _mm_slli_epi32 (a, n);
387}
388void y(unsigned n) {
389  a = _mm_slli_epi32 (a, n);
390}
391
392compile to ( -O3 -static -fomit-frame-pointer):
393_x:
394        movzwl  4(%esp), %eax
395        movd    %eax, %xmm0
396        movaps  _a, %xmm1
397        pslld   %xmm0, %xmm1
398        movaps  %xmm1, _a
399        ret
400_y:
401        movd    4(%esp), %xmm0
402        movaps  _a, %xmm1
403        pslld   %xmm0, %xmm1
404        movaps  %xmm1, _a
405        ret
406
407"y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
408like movd would be sufficient in both cases as the value is already zero 
409extended in the 32-bit stack slot IIRC.  For signed short, it should also be
410safe, as a really-signed value would be undefined for pslld.
411
412
413//===---------------------------------------------------------------------===//
414
415#include <math.h>
416int t1(double d) { return signbit(d); }
417
418This currently compiles to:
419	subl	$12, %esp
420	movsd	16(%esp), %xmm0
421	movsd	%xmm0, (%esp)
422	movl	4(%esp), %eax
423	shrl	$31, %eax
424	addl	$12, %esp
425	ret
426
427We should use movmskp{s|d} instead.
428
429//===---------------------------------------------------------------------===//
430
431CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
432(aligned) vector load.  This functionality has a couple of problems.
433
4341. The code to infer alignment from loads of globals is in the X86 backend,
435   not the dag combiner.  This is because dagcombine2 needs to be able to see
436   through the X86ISD::Wrapper node, which DAGCombine can't really do.
4372. The code for turning 4 x load into a single vector load is target 
438   independent and should be moved to the dag combiner.
4393. The code for turning 4 x load into a vector load can only handle a direct 
440   load from a global or a direct load from the stack.  It should be generalized
441   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4424. The alignment inference code cannot handle loads from globals in non-static
443   mode because it doesn't look through the extra dyld stub load.  If you try
444   vec_align.ll without -relocation-model=static, you'll see what I mean.
445
446//===---------------------------------------------------------------------===//
447
448We should lower store(fneg(load p), q) into an integer load+xor+store, which
449eliminates a constant pool load.  For example, consider:
450
451define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
452entry:
453 %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
454 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
455 ret i64 %tmp20
456}
457declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
458
459This currently compiles to:
460
461LCPI1_0:					#  <4 x float>
462	.long	2147483648	# float -0
463	.long	2147483648	# float -0
464	.long	2147483648	# float -0
465	.long	2147483648	# float -0
466_ccosf:
467	subl	$12, %esp
468	movss	16(%esp), %xmm0
469	movss	%xmm0, 4(%esp)
470	movss	20(%esp), %xmm0
471	xorps	LCPI1_0, %xmm0
472	movss	%xmm0, (%esp)
473	call	L_ccoshf$stub
474	addl	$12, %esp
475	ret
476
477Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
478this code computes the pic base and does two loads to do the constant pool 
479load, so the improvement is much bigger.
480
481The tricky part about this xform is that the argument load/store isn't exposed
482until post-legalize, and at that point, the fneg has been custom expanded into 
483an X86 fxor.  This means that we need to handle this case in the x86 backend
484instead of in target independent code.
485
486//===---------------------------------------------------------------------===//
487
488Non-SSE4 insert into 16 x i8 is atrociously bad.
489
490//===---------------------------------------------------------------------===//
491
492<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
493is memory.
494
495//===---------------------------------------------------------------------===//
496
497SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
498sitting between the truncate and the extract.
499
500//===---------------------------------------------------------------------===//
501
502INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
503any number of 0.0 simultaneously.  Currently we only use it for simple
504insertions.
505
506See comments in LowerINSERT_VECTOR_ELT_SSE4.
507
508//===---------------------------------------------------------------------===//
509
510On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
511Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
512legal, it'll just take a few extra patterns written in the .td file.
513
514Note: this is not a code quality issue; the custom lowered code happens to be
515right, but we shouldn't have to custom lower anything.  This is probably related
516to <2 x i64> ops being so bad.
517
518//===---------------------------------------------------------------------===//
519
520'select' on vectors and scalars could be a whole lot better.  We currently 
521lower them to conditional branches.  On x86-64 for example, we compile this:
522
523double test(double a, double b, double c, double d) { return a<b ? c : d; }
524
525to:
526
527_test:
528	ucomisd	%xmm0, %xmm1
529	ja	LBB1_2	# entry
530LBB1_1:	# entry
531	movapd	%xmm3, %xmm2
532LBB1_2:	# entry
533	movapd	%xmm2, %xmm0
534	ret
535
536instead of:
537
538_test:
539	cmpltsd	%xmm1, %xmm0
540	andpd	%xmm0, %xmm2
541	andnpd	%xmm3, %xmm0
542	orpd	%xmm2, %xmm0
543	ret
544
545For unpredictable branches, the latter is much more efficient.  This should
546just be a matter of having scalar sse map to SELECT_CC and custom expanding
547or iseling it.
548
549//===---------------------------------------------------------------------===//
550
551LLVM currently generates stack realignment code, when it is not necessarily
552needed. The problem is that we need to know about stack alignment too early,
553before RA runs.
554
555At that point we don't know, whether there will be vector spill, or not.
556Stack realignment logic is overly conservative here, but otherwise we can
557produce unaligned loads/stores.
558
559Fixing this will require some huge RA changes.
560
561Testcase:
562#include <emmintrin.h>
563
564typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
565
566static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
567- 22725, - 12873};
568
569vSInt16 madd(vSInt16 b)
570{
571    return _mm_madd_epi16(a, b);
572}
573
574Generated code (x86-32, linux):
575madd:
576        pushl   %ebp
577        movl    %esp, %ebp
578        andl    $-16, %esp
579        movaps  .LCPI1_0, %xmm1
580        pmaddwd %xmm1, %xmm0
581        movl    %ebp, %esp
582        popl    %ebp
583        ret
584
585//===---------------------------------------------------------------------===//
586
587Consider:
588#include <emmintrin.h> 
589__m128 foo2 (float x) {
590 return _mm_set_ps (0, 0, x, 0);
591}
592
593In x86-32 mode, we generate this spiffy code:
594
595_foo2:
596	movss	4(%esp), %xmm0
597	pshufd	$81, %xmm0, %xmm0
598	ret
599
600in x86-64 mode, we generate this code, which could be better:
601
602_foo2:
603	xorps	%xmm1, %xmm1
604	movss	%xmm0, %xmm1
605	pshufd	$81, %xmm1, %xmm0
606	ret
607
608In sse4 mode, we could use insertps to make both better.
609
610Here's another testcase that could use insertps [mem]:
611
612#include <xmmintrin.h>
613extern float x2, x3;
614__m128 foo1 (float x1, float x4) {
615 return _mm_set_ps (x2, x1, x3, x4);
616}
617
618gcc mainline compiles it to:
619
620foo1:
621       insertps        $0x10, x2(%rip), %xmm0
622       insertps        $0x10, x3(%rip), %xmm1
623       movaps  %xmm1, %xmm2
624       movlhps %xmm0, %xmm2
625       movaps  %xmm2, %xmm0
626       ret
627
628//===---------------------------------------------------------------------===//
629
630We compile vector multiply-by-constant into poor code:
631
632define <4 x i32> @f(<4 x i32> %i) nounwind  {
633	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
634	ret <4 x i32> %A
635}
636
637On targets without SSE4.1, this compiles into:
638
639LCPI1_0:					##  <4 x i32>
640	.long	10
641	.long	10
642	.long	10
643	.long	10
644	.text
645	.align	4,0x90
646	.globl	_f
647_f:
648	pshufd	$3, %xmm0, %xmm1
649	movd	%xmm1, %eax
650	imull	LCPI1_0+12, %eax
651	movd	%eax, %xmm1
652	pshufd	$1, %xmm0, %xmm2
653	movd	%xmm2, %eax
654	imull	LCPI1_0+4, %eax
655	movd	%eax, %xmm2
656	punpckldq	%xmm1, %xmm2
657	movd	%xmm0, %eax
658	imull	LCPI1_0, %eax
659	movd	%eax, %xmm1
660	movhlps	%xmm0, %xmm0
661	movd	%xmm0, %eax
662	imull	LCPI1_0+8, %eax
663	movd	%eax, %xmm0
664	punpckldq	%xmm0, %xmm1
665	movaps	%xmm1, %xmm0
666	punpckldq	%xmm2, %xmm0
667	ret
668
669It would be better to synthesize integer vector multiplication by constants
670using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
671simple cases such as multiplication by powers of two would be better as
672vector shifts than as multiplications.
673
674//===---------------------------------------------------------------------===//
675
676We compile this:
677
678__m128i
679foo2 (char x)
680{
681  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
682}
683
684into:
685	movl	$1, %eax
686	xorps	%xmm0, %xmm0
687	pinsrw	$2, %eax, %xmm0
688	movzbl	4(%esp), %eax
689	pinsrw	$3, %eax, %xmm0
690	movl	$256, %eax
691	pinsrw	$7, %eax, %xmm0
692	ret
693
694
695gcc-4.2:
696	subl	$12, %esp
697	movzbl	16(%esp), %eax
698	movdqa	LC0, %xmm0
699	pinsrw	$3, %eax, %xmm0
700	addl	$12, %esp
701	ret
702	.const
703	.align 4
704LC0:
705	.word	0
706	.word	0
707	.word	1
708	.word	0
709	.word	0
710	.word	0
711	.word	0
712	.word	256
713
714With SSE4, it should be
715      movdqa  .LC0(%rip), %xmm0
716      pinsrb  $6, %edi, %xmm0
717
718//===---------------------------------------------------------------------===//
719
720We should transform a shuffle of two vectors of constants into a single vector
721of constants. Also, insertelement of a constant into a vector of constants
722should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
723
724We compiled it to something horrible:
725
726	.align	4
727LCPI1_1:					##  float
728	.long	1065353216	## float 1
729	.const
730
731	.align	4
732LCPI1_0:					##  <4 x float>
733	.space	4
734	.long	1065353216	## float 1
735	.space	4
736	.long	1065353216	## float 1
737	.text
738	.align	4,0x90
739	.globl	_t
740_t:
741	xorps	%xmm0, %xmm0
742	movhps	LCPI1_0, %xmm0
743	movss	LCPI1_1, %xmm1
744	movaps	%xmm0, %xmm2
745	shufps	$2, %xmm1, %xmm2
746	shufps	$132, %xmm2, %xmm0
747	movaps	%xmm0, 0
748
749//===---------------------------------------------------------------------===//
750rdar://5907648
751
752This function:
753
754float foo(unsigned char x) {
755  return x;
756}
757
758compiles to (x86-32):
759
760define float @foo(i8 zeroext  %x) nounwind  {
761	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
762	ret float %tmp12
763}
764
765compiles to:
766
767_foo:
768	subl	$4, %esp
769	movzbl	8(%esp), %eax
770	cvtsi2ss	%eax, %xmm0
771	movss	%xmm0, (%esp)
772	flds	(%esp)
773	addl	$4, %esp
774	ret
775
776We should be able to use:
777  cvtsi2ss 8(%esp), %xmm0
778since we know the stack slot is already zext'd.
779
780//===---------------------------------------------------------------------===//
781
782Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
783when code size is critical. movlps is slower than movsd on core2 but it's one
784byte shorter.
785
786//===---------------------------------------------------------------------===//
787
788We should use a dynamic programming based approach to tell when using FPStack
789operations is cheaper than SSE.  SciMark montecarlo contains code like this
790for example:
791
792double MonteCarlo_num_flops(int Num_samples) {
793    return ((double) Num_samples)* 4.0;
794}
795
796In fpstack mode, this compiles into:
797
798LCPI1_0:					
799	.long	1082130432	## float 4.000000e+00
800_MonteCarlo_num_flops:
801	subl	$4, %esp
802	movl	8(%esp), %eax
803	movl	%eax, (%esp)
804	fildl	(%esp)
805	fmuls	LCPI1_0
806	addl	$4, %esp
807	ret
808        
809in SSE mode, it compiles into significantly slower code:
810
811_MonteCarlo_num_flops:
812	subl	$12, %esp
813	cvtsi2sd	16(%esp), %xmm0
814	mulsd	LCPI1_0, %xmm0
815	movsd	%xmm0, (%esp)
816	fldl	(%esp)
817	addl	$12, %esp
818	ret
819
820There are also other cases in scimark where using fpstack is better; it is
821cheaper to do fld1 than to load from a constant pool, for example, so
822"load, add 1.0, store" is better done in the fp stack, etc.
823
824//===---------------------------------------------------------------------===//
825
826The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
827"cmpsd".  For example, this code:
828
829double d1(double x) { return x == x ? x : x + x; }
830
831Compiles into:
832
833_d1:
834	ucomisd	%xmm0, %xmm0
835	jnp	LBB1_2
836	addsd	%xmm0, %xmm0
837	ret
838LBB1_2:
839	ret
840
841Also, the 'ret's should be shared.  This is PR6032.
842
843//===---------------------------------------------------------------------===//
844
845These should compile into the same code (PR6214): Perhaps instcombine should
846canonicalize the former into the latter?
847
848define float @foo(float %x) nounwind {
849  %t = bitcast float %x to i32
850  %s = and i32 %t, 2147483647
851  %d = bitcast i32 %s to float
852  ret float %d
853}
854
855declare float @fabsf(float %n)
856define float @bar(float %x) nounwind {
857  %d = call float @fabsf(float %x)
858  ret float %d
859}
860
861//===---------------------------------------------------------------------===//
862
863This IR (from PR6194):
864
865target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
866target triple = "x86_64-apple-darwin10.0.0"
867
868%0 = type { double, double }
869%struct.float3 = type { float, float, float }
870
871define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
872entry:
873  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
874  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
875  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
876  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
877  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
878  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
879  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
880  store float %tmp12, float* %tmp5
881  ret void
882}
883
884Compiles to:
885
886_test:                                  ## @test
887	movd	%xmm0, %rax
888	shrq	$32, %rax
889	movl	%eax, 4(%rdi)
890	ret
891
892This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
893doing a shuffle from v[1] to v[0] then a float store.
894
895//===---------------------------------------------------------------------===//
896
897On SSE4 machines, we compile this code:
898
899define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
900       <2 x float> *%P) nounwind {
901  %Z = fadd <2 x float> %Q, %R
902
903  store <2 x float> %Z, <2 x float> *%P
904  ret <2 x float> %Z
905}
906
907into:
908
909_test2:                                 ## @test2
910## BB#0:
911	insertps	$0, %xmm2, %xmm2
912	insertps	$16, %xmm3, %xmm2
913	insertps	$0, %xmm0, %xmm3
914	insertps	$16, %xmm1, %xmm3
915	addps	%xmm2, %xmm3
916	movq	%xmm3, (%rdi)
917	movaps	%xmm3, %xmm0
918	pshufd	$1, %xmm3, %xmm1
919                                        ## kill: XMM1<def> XMM1<kill>
920	ret
921
922The insertps's of $0 are pointless complex copies.
923
924//===---------------------------------------------------------------------===//
925
926[UNSAFE FP]
927
928void foo(double, double, double);
929void norm(double x, double y, double z) {
930  double scale = __builtin_sqrt(x*x + y*y + z*z);
931  foo(x/scale, y/scale, z/scale);
932}
933
934We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
935slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
936and emit 3 mulsd in place of the divs. This can be done as a target-independent
937transform.
938
939If we're dealing with floats instead of doubles we could even replace the sqrtss
940and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
941cost of reduced accuracy.
942
943//===---------------------------------------------------------------------===//
944
945This function should be matched to haddpd when the appropriate CPU is enabled:
946
947#include <x86intrin.h>
948double f (__m128d p) {
949  return p[0] + p[1];
950}
951
952similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
953turn into hsubpd also.
954
955//===---------------------------------------------------------------------===//
956

README-UNIMPLEMENTED.txt

1//===---------------------------------------------------------------------===//
2// Testcases that crash the X86 backend because they aren't implemented
3//===---------------------------------------------------------------------===//
4
5These are cases we know the X86 backend doesn't handle.  Patches are welcome
6and appreciated, because no one has signed up to implement these yet.
7Implementing these would allow elimination of the corresponding intrinsics,
8which would be great.
9
101) vector shifts
112) vector comparisons
123) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
134) bitcasts from vectors to scalars: PR2804
145) llvm.atomic.cmp.swap.i128.p0i128: PR3462
15

README-X86-64.txt

1//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
2
3AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
4multiplication by a constant. How much of it applies to Intel's X86-64
5implementation? There are definite trade-offs to consider: latency vs. register
6pressure vs. code size.
7
8//===---------------------------------------------------------------------===//
9
10Are we better off using branches instead of cmove to implement FP to
11unsigned i64?
12
13_conv:
14	ucomiss	LC0(%rip), %xmm0
15	cvttss2siq	%xmm0, %rdx
16	jb	L3
17	subss	LC0(%rip), %xmm0
18	movabsq	$-9223372036854775808, %rax
19	cvttss2siq	%xmm0, %rdx
20	xorq	%rax, %rdx
21L3:
22	movq	%rdx, %rax
23	ret
24
25instead of
26
27_conv:
28	movss LCPI1_0(%rip), %xmm1
29	cvttss2siq %xmm0, %rcx
30	movaps %xmm0, %xmm2
31	subss %xmm1, %xmm2
32	cvttss2siq %xmm2, %rax
33	movabsq $-9223372036854775808, %rdx
34	xorq %rdx, %rax
35	ucomiss %xmm1, %xmm0
36	cmovb %rcx, %rax
37	ret
38
39Seems like the jb branch has high likelihood of being taken. It would have
40saved a few instructions.
41
42//===---------------------------------------------------------------------===//
43
44It's not possible to reference AH, BH, CH, and DH registers in an instruction
45requiring a REX prefix. However, divb and mulb both produce results in AH. If isel
46emits a CopyFromReg which gets turned into a movb, that movb could be allocated
47an r8b - r15b register, which cannot be encoded together with AH.
48
49To get around this, isel emits a CopyFromReg from AX and then right shift it
50down by 8 and truncate it. It's not pretty but it works. We need some register
51allocation magic to make the hack go away (e.g. putting additional constraints
52on the result of the movb).
53
54//===---------------------------------------------------------------------===//
55
56The x86-64 ABI for hidden-argument struct returns requires that the
57incoming value of %rdi be copied into %rax by the callee upon return.
58
59The idea is that it saves callers from having to remember this value,
60which would often require a callee-saved register. Callees usually
61need to keep this value live for most of their body anyway, so it
62doesn't add a significant burden on them.
63
64We currently implement this in codegen, however this is suboptimal
65because it means that it would be quite awkward to implement the
66optimization for callers.
67
68A better implementation would be to relax the LLVM IR rules for sret
69arguments to allow a function with an sret argument to have a non-void
70return type, and to have the front-end set up the sret argument value
71as the return value of the function. The front-end could more easily
72emit uses of the returned struct value to be in terms of the function's
73lowered return value, and it would free non-C frontends from a
74complication only required by a C-based ABI.
75
76//===---------------------------------------------------------------------===//
77
78We get a redundant zero extension for code like this:
79
80int mask[1000];
81int foo(unsigned x) {
82 if (x < 10)
83   x = x * 45;
84 else
85   x = x * 78;
86 return mask[x];
87}
88
89_foo:
90LBB1_0:	## entry
91	cmpl	$9, %edi
92	jbe	LBB1_3	## bb
93LBB1_1:	## bb1
94	imull	$78, %edi, %eax
95LBB1_2:	## bb2
96	movl	%eax, %eax                    <----
97	movq	_mask@GOTPCREL(%rip), %rcx
98	movl	(%rcx,%rax,4), %eax
99	ret
100LBB1_3:	## bb
101	imull	$45, %edi, %eax
102	jmp	LBB1_2	## bb2
103  
104Before regalloc, we have:
105
106        %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
107        JMP mbb<bb2,0x203afb0>
108    Successors according to CFG: 0x203afb0 (#3)
109
110bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
111    Predecessors according to CFG: 0x203aec0 (#0)
112        %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
113    Successors according to CFG: 0x203afb0 (#3)
114
115bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
116    Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
117        %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
118                            %reg1026, mbb<bb1,0x203af60>
119        %reg1029<def> = MOVZX64rr32 %reg1027
120
121so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
122be able to recognize the zero extend.  This could also presumably be implemented
123if we have whole-function selectiondags.
124
125//===---------------------------------------------------------------------===//
126
127Take the following code
128(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
129extern unsigned long table[];
130unsigned long foo(unsigned char *p) {
131  unsigned long tag = *p;
132  return table[tag >> 4] + table[tag & 0xf];
133}
134
135Current code generated:
136	movzbl	(%rdi), %eax
137	movq	%rax, %rcx
138	andq	$240, %rcx
139	shrq	%rcx
140	andq	$15, %rax
141	movq	table(,%rax,8), %rax
142	addq	table(%rcx), %rax
143	ret
144
145Issues:
1461. First movq should be movl; saves a byte.
1472. Both andq's should be andl; saves another two bytes.  I think this was
148   implemented at one point, but subsequently regressed.
1493. shrq should be shrl; saves another byte.
1504. The first andq can be completely eliminated by using a slightly more
151   expensive addressing mode.
152
153//===---------------------------------------------------------------------===//
154
155Consider the following (contrived testcase, but contains common factors):
156
157#include <stdarg.h>
158int test(int x, ...) {
159  int sum, i;
160  va_list l;
161  va_start(l, x);
162  for (i = 0; i < x; i++)
163    sum += va_arg(l, int);
164  va_end(l);
165  return sum;
166}
167
168Testcase given in C because fixing it will likely involve changing the IR
169generated for it.  The primary issue with the result is that it doesn't do any
170of the optimizations which are possible if we know the address of a va_list
171in the current function is never taken:
1721. We shouldn't spill the XMM registers because we only call va_arg with "int".
1732. It would be nice if we could scalarrepl the va_list.
1743. Probably overkill, but it'd be cool if we could peel off the first five
175iterations of the loop.
176
177Other optimizations involving functions which use va_arg on floats which don't
178have the address of a va_list taken:
1791. Conversely to the above, we shouldn't spill general registers if we only
180   call va_arg on "double".
1812. If we know nothing more than 64 bits wide is read from the XMM registers,
182   we can change the spilling code to reduce the amount of stack used by half.
183
184//===---------------------------------------------------------------------===//
185

README.txt

1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend.
3//===---------------------------------------------------------------------===//
4
5This should be one DIV/IDIV instruction, not a libcall:
6
7unsigned test(unsigned long long X, unsigned Y) {
8        return X/Y;
9}
10
11This can be done trivially with a custom legalizer.  What about overflow 
12though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
13
14//===---------------------------------------------------------------------===//
15
16Improvements to the multiply -> shift/add algorithm:
17http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
18
19//===---------------------------------------------------------------------===//
20
21Improve code like this (occurs fairly frequently, e.g. in LLVM):
22long long foo(int x) { return 1LL << x; }
23
24http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
25http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
26http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
27
28Another useful one would be  ~0ULL >> X and ~0ULL << X.
29
30One better solution for 1LL << x is:
31        xorl    %eax, %eax
32        xorl    %edx, %edx
33        testb   $32, %cl
34        sete    %al
35        setne   %dl
36        sall    %cl, %eax
37        sall    %cl, %edx
38
39But that requires good 8-bit subreg support.
40
41Also, this might be better.  It's an extra shift, but it's one instruction
42shorter, and doesn't stress 8-bit subreg support.
43(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
44but without the unnecessary and.)
45        movl %ecx, %eax
46        shrl $5, %eax
47        movl %eax, %edx
48        xorl $1, %edx
49        sall %cl, %eax
50        sall %cl, %edx
51
5264-bit shifts (in general) expand to really bad code.  Instead of using
53cmovs, we should expand to a conditional branch like GCC produces.
54
55//===---------------------------------------------------------------------===//
56
57Some isel ideas:
58
591. Dynamic programming based approach when compile time is not an
60   issue.
612. Code duplication (addressing mode) during isel.
623. Other ideas from "Register-Sensitive Selection, Duplication, and
63   Sequencing of Instructions".
644. Scheduling for reduced register pressure.  E.g. "Minimum Register 
65   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
66   and other related papers.
67   http://citeseer.ist.psu.edu/govindarajan01minimum.html
68
69//===---------------------------------------------------------------------===//
70
71Should we promote i16 to i32 to avoid partial register update stalls?
72
73//===---------------------------------------------------------------------===//
74
75Leave any_extend as pseudo instruction and hint to register
76allocator. Delay codegen until post register allocation.
77Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
78the coalescer how to deal with it though.
79
80//===---------------------------------------------------------------------===//
81
82It appears icc uses push for parameter passing. Need to investigate.
83
84//===---------------------------------------------------------------------===//
85
86This:
87
88void foo(void);
89void bar(int x, int *P) { 
90  x >>= 2;
91  if (x) 
92    foo();
93  *P = x;
94}
95
96compiles into:
97
98	movq	%rsi, %rbx
99	movl	%edi, %r14d
100	sarl	$2, %r14d
101	testl	%r14d, %r14d
102	je	LBB0_2
103
104Instead of doing an explicit test, we can use the flags off the sar.  This
105occurs in a bigger testcase like this, which is pretty common:
106
107#include <vector>
108int test1(std::vector<int> &X) {
109  int Sum = 0;
110  for (long i = 0, e = X.size(); i != e; ++i)
111    X[i] = 0;
112  return Sum;
113}
114
115//===---------------------------------------------------------------------===//
116
117Only use inc/neg/not instructions on processors where they are faster than
118add/sub/xor.  They are slower on the P4 due to only updating some processor
119flags.
120
121//===---------------------------------------------------------------------===//
122
123The instruction selector sometimes misses folding a load into a compare.  The
124pattern is written as (cmp reg, (load p)).  Because the compare isn't 
125commutative, it is not matched with the load on both sides.  The dag combiner
126should be made smart enough to canonicalize the load into the RHS of a compare
127when it can invert the result of the compare for free.
128
129//===---------------------------------------------------------------------===//
130
131In many cases, LLVM generates code like this:
132
133_test:
134        movl 8(%esp), %eax
135        cmpl %eax, 4(%esp)
136        setl %al
137        movzbl %al, %eax
138        ret
139
140on some processors (which ones?), it is more efficient to do this:
141
142_test:
143        movl 8(%esp), %ebx
144        xor  %eax, %eax
145        cmpl %ebx, 4(%esp)
146        setl %al
147        ret
148
149Doing this correctly is tricky though, as the xor clobbers the flags.
150
151//===---------------------------------------------------------------------===//
152
153We should generate bts/btr/etc instructions on targets where they are cheap or
154when codesize is important.  e.g., for:
155
156void setbit(int *target, int bit) {
157    *target |= (1 << bit);
158}
159void clearbit(int *target, int bit) {
160    *target &= ~(1 << bit);
161}
162
163//===---------------------------------------------------------------------===//
164
165Instead of the following for memset char*, 1, 10:
166
167	movl $16843009, 4(%edx)
168	movl $16843009, (%edx)
169	movw $257, 8(%edx)
170
171It might be better to generate
172
173	movl $16843009, %eax
174	movl %eax, 4(%edx)
175	movl %eax, (%edx)
176	movw %ax, 8(%edx)
177	
178when we can spare a register. It reduces code size.
179
180//===---------------------------------------------------------------------===//
181
182Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
183get this:
184
185define i32 @test1(i32 %X) {
186    %Y = sdiv i32 %X, 8
187    ret i32 %Y
188}
189
190_test1:
191        movl 4(%esp), %eax
192        movl %eax, %ecx
193        sarl $31, %ecx
194        shrl $29, %ecx
195        addl %ecx, %eax
196        sarl $3, %eax
197        ret
198
199GCC knows several different ways to codegen it, one of which is this:
200
201_test1:
202        movl    4(%esp), %eax
203        cmpl    $-1, %eax
204        leal    7(%eax), %ecx
205        cmovle  %ecx, %eax
206        sarl    $3, %eax
207        ret
208
209which is probably slower, but it's interesting at least :)
210
211//===---------------------------------------------------------------------===//
212
213We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
214We should leave these as libcalls for everything over a much lower threshold,
215since libc is hand tuned for medium and large mem ops (avoiding RFO for large
216stores, TLB preheating, etc)
217
218//===---------------------------------------------------------------------===//
219
220Optimize this into something reasonable:
221 x * copysign(1.0, y) * copysign(1.0, z)
222
223//===---------------------------------------------------------------------===//
224
225Optimize copysign(x, *y) to use an integer load from y.
226
227//===---------------------------------------------------------------------===//
228
229The following tests perform worse with LSR:
230
231lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
232
233//===---------------------------------------------------------------------===//
234
235Adding to the list of cmp / test poor codegen issues:
236
237int test(__m128 *A, __m128 *B) {
238  if (_mm_comige_ss(*A, *B))
239    return 3;
240  else
241    return 4;
242}
243
244_test:
245	movl 8(%esp), %eax
246	movaps (%eax), %xmm0
247	movl 4(%esp), %eax
248	movaps (%eax), %xmm1
249	comiss %xmm0, %xmm1
250	setae %al
251	movzbl %al, %ecx
252	movl $3, %eax
253	movl $4, %edx
254	cmpl $0, %ecx
255	cmove %edx, %eax
256	ret
257
258Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
259are a number of issues. 1) We are introducing a setcc between the result of the
260intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
261so an any-extend (which becomes a zero extend) is added.
262
263We probably need some kind of target DAG combine hook to fix this.
264
265//===---------------------------------------------------------------------===//
266
267We generate significantly worse code for this than GCC:
268http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
269http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
270
271There is also one case we do worse on PPC.
272
273//===---------------------------------------------------------------------===//
274
275For this:
276
277int test(int a)
278{
279  return a * 3;
280}
281
282We currently emit
283	imull $3, 4(%esp), %eax
284
285Perhaps this is what we really should generate? Is imull three or four
286cycles? Note: ICC generates this:
287	movl	4(%esp), %eax
288	leal	(%eax,%eax,2), %eax
289
290The current instruction priority is based on pattern complexity. The former is
291more "complex" because it folds a load so the latter will not be emitted.
292
293Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
294should always try to match LEA first since the LEA matching code does some
295estimate to determine whether the match is profitable.
296
297However, if we care more about code size, then imull is better. It's two bytes
298shorter than movl + leal.
299
300On a Pentium M, both variants have the same characteristics with regard
301to throughput; however, the multiplication has a latency of four cycles, as
302opposed to two cycles for the movl+lea variant.
303
304//===---------------------------------------------------------------------===//
305
306__builtin_ffs codegen is messy.
307
308int ffs_(unsigned X) { return __builtin_ffs(X); }
309
310llvm produces:
311ffs_:
312        movl    4(%esp), %ecx
313        bsfl    %ecx, %eax
314        movl    $32, %edx
315        cmove   %edx, %eax
316        incl    %eax
317        xorl    %edx, %edx
318        testl   %ecx, %ecx
319        cmove   %edx, %eax
320        ret
321
322vs gcc:
323
324_ffs_:
325        movl    $-1, %edx
326        bsfl    4(%esp), %eax
327        cmove   %edx, %eax
328        addl    $1, %eax
329        ret
330
331Another example of __builtin_ffs (use predsimplify to eliminate a select):
332
333int foo (unsigned long j) {
334  if (j)
335    return __builtin_ffs (j) - 1;
336  else
337    return 0;
338}
339
340//===---------------------------------------------------------------------===//
341
342It appears gcc places string data with linkonce linkage in
343.section __TEXT,__const_coal,coalesced instead of
344.section __DATA,__const_coal,coalesced.
345Take a look at darwin.h, there are other Darwin assembler directives that we
346do not make use of.
347
348//===---------------------------------------------------------------------===//
349
350define i32 @foo(i32* %a, i32 %t) {
351entry:
352	br label %cond_true
353
354cond_true:		; preds = %cond_true, %entry
355	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
356	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
357	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
358	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
359	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
360	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
361	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
362	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
363	br i1 %tmp, label %bb12, label %cond_true
364
365bb12:		; preds = %cond_true
366	ret i32 %tmp7
367}
368is pessimized by -loop-reduce and -indvars
369
370//===---------------------------------------------------------------------===//
371
372u32 to float conversion improvement:
373
374float uint32_2_float( unsigned u ) {
375  float fl = (int) (u & 0xffff);
376  float fh = (int) (u >> 16);
377  fh *= 0x1.0p16f;
378  return fh + fl;
379}
380
38100000000        subl    $0x04,%esp
38200000003        movl    0x08(%esp,1),%eax
38300000007        movl    %eax,%ecx
38400000009        shrl    $0x10,%ecx
3850000000c        cvtsi2ss        %ecx,%xmm0
38600000010        andl    $0x0000ffff,%eax
38700000015        cvtsi2ss        %eax,%xmm1
38800000019        mulss   0x00000078,%xmm0
38900000021        addss   %xmm1,%xmm0
39000000025        movss   %xmm0,(%esp,1)
3910000002a        flds    (%esp,1)
3920000002d        addl    $0x04,%esp
39300000030        ret
394
395//===---------------------------------------------------------------------===//
396
397When using fastcc abi, align stack slot of argument of type double on 8 byte
398boundary to improve performance.
399
400//===---------------------------------------------------------------------===//
401
402GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
403simplifications for integer "x cmp y ? a : b".
404
405//===---------------------------------------------------------------------===//
406
407Consider the expansion of:
408
409define i32 @test3(i32 %X) {
410        %tmp1 = urem i32 %X, 255
411        ret i32 %tmp1
412}
413
414Currently it compiles to:
415
416...
417        movl $2155905153, %ecx
418        movl 8(%esp), %esi
419        movl %esi, %eax
420        mull %ecx
421...
422
423This could be "reassociated" into:
424
425        movl $2155905153, %eax
426        movl 8(%esp), %ecx
427        mull %ecx
428
429to avoid the copy.  In fact, the existing two-address stuff would do this
430except that mul isn't a commutative 2-addr instruction.  I guess this has
431to be done at isel time based on the #uses to mul?
432
433//===---------------------------------------------------------------------===//
434
435Make sure the instruction which starts a loop does not cross a cacheline
436boundary. This requires knowing the exact length of each machine instruction.
437That is somewhat complicated, but doable. Example 256.bzip2:
438
439In the new trace, the hot loop has an instruction which crosses a cacheline
440boundary.  In addition to potential cache misses, this can't help decoding as I
441imagine there has to be some kind of complicated decoder reset and realignment
442to grab the bytes from the next cacheline.
443
444532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
445942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
446937  937 0x3d0a incl     %esi
4473    3   0x3d0b cmpb     %bl, %dl
44827   27  0x3d0d jnz      0x000062db <main+11707>
449
450//===---------------------------------------------------------------------===//
451
452In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
453
454//===---------------------------------------------------------------------===//
455
456This could be a single 16-bit load.
457
458int f(char *p) {
459    if ((p[0] == 1) & (p[1] == 2)) return 1;
460    return 0;
461}
462
463//===---------------------------------------------------------------------===//
464
465We should inline lrintf and probably other libc functions.
466
467//===---------------------------------------------------------------------===//
468
469Use the FLAGS values from arithmetic instructions more.  For example, compile:
470
471int add_zf(int *x, int y, int a, int b) {
472     if ((*x += y) == 0)
473          return a;
474     else
475          return b;
476}
477
478to:
479       addl    %esi, (%rdi)
480       movl    %edx, %eax
481       cmovne  %ecx, %eax
482       ret
483instead of:
484
485_add_zf:
486        addl (%rdi), %esi
487        movl %esi, (%rdi)
488        testl %esi, %esi
489        cmove %edx, %ecx
490        movl %ecx, %eax
491        ret
492
493As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
494without a test instruction.
495
496//===---------------------------------------------------------------------===//
497
498These two functions have identical effects:
499
500unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
501unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
502
503We currently compile them to:
504
505_f:
506        movl 4(%esp), %eax
507        movl %eax, %ecx
508        incl %ecx
509        movl 8(%esp), %edx
510        cmpl %edx, %ecx
511        jne LBB1_2      #UnifiedReturnBlock
512LBB1_1: #cond_true
513        addl $2, %eax
514        ret
515LBB1_2: #UnifiedReturnBlock
516        movl %ecx, %eax
517        ret
518_f2:
519        movl 4(%esp), %eax
520        movl %eax, %ecx
521        incl %ecx
522        cmpl 8(%esp), %ecx
523        sete %cl
524        movzbl %cl, %ecx
525        leal 1(%ecx,%eax), %eax
526        ret
527
528both of which are inferior to GCC's:
529
530_f:
531        movl    4(%esp), %edx
532        leal    1(%edx), %eax
533        addl    $2, %edx
534        cmpl    8(%esp), %eax
535        cmove   %edx, %eax
536        ret
537_f2:
538        movl    4(%esp), %eax
539        addl    $1, %eax
540        xorl    %edx, %edx
541        cmpl    8(%esp), %eax
542        sete    %dl
543        addl    %edx, %eax
544        ret
545
546//===---------------------------------------------------------------------===//
547
548This code:
549
550void test(int X) {
551  if (X) abort();
552}
553
554is currently compiled to:
555
556_test:
557        subl $12, %esp
558        cmpl $0, 16(%esp)
559        jne LBB1_1
560        addl $12, %esp
561        ret
562LBB1_1:
563        call L_abort$stub
564
565It would be better to produce:
566
567_test:
568        subl $12, %esp
569        cmpl $0, 16(%esp)
570        jne L_abort$stub
571        addl $12, %esp
572        ret
573
574This can be applied to any no-return function call that takes no arguments etc.
575Alternatively, the stack save/restore logic could be shrink-wrapped, producing
576something like this:
577
578_test:
579        cmpl $0, 4(%esp)
580        jne LBB1_1
581        ret
582LBB1_1:
583        subl $12, %esp
584        call L_abort$stub
585
586Both are useful in different situations.  Finally, it could be shrink-wrapped
587and tail called, like this:
588
589_test:
590        cmpl $0, 4(%esp)
591        jne LBB1_1
592        ret
593LBB1_1:
594        pop %eax   # realign stack.
595        call L_abort$stub
596
597Though this probably isn't worth it.
598
599//===---------------------------------------------------------------------===//
600
601Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
602a neg instead of a sub instruction.  Consider:
603
604int test(char X) { return 7-X; }
605
606we currently produce:
607_test:
608        movl $7, %eax
609        movsbl 4(%esp), %ecx
610        subl %ecx, %eax
611        ret
612
613We would use one fewer register if codegen'd as:
614
615        movsbl 4(%esp), %eax
616	neg %eax
617        add $7, %eax
618        ret
619
620Note that this isn't beneficial if the load can be folded into the sub.  In
621this case, we want a sub:
622
623int test(int X) { return 7-X; }
624_test:
625        movl $7, %eax
626        subl 4(%esp), %eax
627        ret
628
629//===---------------------------------------------------------------------===//
630
631Leaf functions that require one 4-byte spill slot have a prolog like this:
632
633_foo:
634        pushl   %esi
635        subl    $4, %esp
636...
637and an epilog like this:
638        addl    $4, %esp
639        popl    %esi
640        ret
641
642It would be smaller, and potentially faster, to push eax on entry and to
643pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
644into any return registers :)
645
646//===---------------------------------------------------------------------===//
647
648The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
649branches.  We generate really poor code for:
650
651double testf(double a) {
652       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
653}
654
655For example, the entry BB is:
656
657_testf:
658        subl    $20, %esp
659        pxor    %xmm0, %xmm0
660        movsd   24(%esp), %xmm1
661        ucomisd %xmm0, %xmm1
662        setnp   %al
663        sete    %cl
664        testb   %cl, %al
665        jne     LBB1_5  # UnifiedReturnBlock
666LBB1_1: # cond_true
667
668
669it would be better to replace the last four instructions with:
670
671	jp LBB1_1
672	je LBB1_5
673LBB1_1:
674
675We also codegen the inner ?: into a diamond:
676
677       cvtss2sd        LCPI1_0(%rip), %xmm2
678        cvtss2sd        LCPI1_1(%rip), %xmm3
679        ucomisd %xmm1, %xmm0
680        ja      LBB1_3  # cond_true
681LBB1_2: # cond_true
682        movapd  %xmm3, %xmm2
683LBB1_3: # cond_true
684        movapd  %xmm2, %xmm0
685        ret
686
687We should sink the load into xmm3 into the LBB1_2 block.  This should
688be pretty easy, and will nuke all the copies.
689
690//===---------------------------------------------------------------------===//
691
692This:
693        #include <algorithm>
694        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
695        { return std::make_pair(a + b, a + b < a); }
696        bool no_overflow(unsigned a, unsigned b)
697        { return !full_add(a, b).second; }
698
699Should compile to:
700	addl	%esi, %edi
701	setae	%al
702	movzbl	%al, %eax
703	ret
704
705on x86-64, instead of the rather stupid-looking:
706	addl	%esi, %edi
707	setb	%al
708	xorb	$1, %al
709	movzbl	%al, %eax
710	ret
711
712
713//===---------------------------------------------------------------------===//
714
715The following code:
716
717bb114.preheader:		; preds = %cond_next94
718	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
719	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
720	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
721	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
722	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
723	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
724	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
725	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
726	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
727	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
728	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
729	br label %bb114
730
731produces:
732
733LBB3_5:	# bb114.preheader
734	movswl	-68(%ebp), %eax
735	movl	$32, %ecx
736	movl	%ecx, -80(%ebp)
737	subl	%eax, -80(%ebp)
738	movswl	-52(%ebp), %eax
739	movl	%ecx, -84(%ebp)
740	subl	%eax, -84(%ebp)
741	movswl	-70(%ebp), %eax
742	movl	%ecx, -88(%ebp)
743	subl	%eax, -88(%ebp)
744	movswl	-50(%ebp), %eax
745	subl	%eax, %ecx
746	movl	%ecx, -76(%ebp)
747	movswl	-42(%ebp), %eax
748	movl	%eax, -92(%ebp)
749	movswl	-66(%ebp), %eax
750	movl	%eax, -96(%ebp)
751	movw	$0, -98(%ebp)
752
753This appears to be bad because the RA is not folding the store to the stack 
754slot into the movl.  The above instructions could be:
755	movl    $32, -80(%ebp)
756...
757	movl    $32, -84(%ebp)
758...
759This seems like a cross between remat and spill folding.
760
761This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
762change, so we could simply subtract %eax from %ecx first and then use %ecx (or
763vice-versa).
764
765//===---------------------------------------------------------------------===//
766
767This code:
768
769	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
770	br i1 %tmp659, label %cond_true662, label %cond_next715
771
772produces this:
773
774	testw	%cx, %cx
775	movswl	%cx, %esi
776	jns	LBB4_109	# cond_next715
777
778Shark tells us that using %cx in the testw instruction is sub-optimal. It
779suggests using the 32-bit register (which is what ICC uses).
780
781//===---------------------------------------------------------------------===//
782
783We compile this:
784
785void compare (long long foo) {
786  if (foo < 4294967297LL)
787    abort();
788}
789
790to:
791
792compare:
793        subl    $4, %esp
794        cmpl    $0, 8(%esp)
795        setne   %al
796        movzbw  %al, %ax
797        cmpl    $1, 12(%esp)
798        setg    %cl
799        movzbw  %cl, %cx
800        cmove   %ax, %cx
801        testb   $1, %cl
802        jne     .LBB1_2 # UnifiedReturnBlock
803.LBB1_1:        # ifthen
804        call    abort
805.LBB1_2:        # UnifiedReturnBlock
806        addl    $4, %esp
807        ret
808
809(also really horrible code on ppc).  This is due to the expand code for 64-bit
810compares.  GCC produces multiple branches, which is much nicer:
811
812compare:
813        subl    $12, %esp
814        movl    20(%esp), %edx
815        movl    16(%esp), %eax
816        decl    %edx
817        jle     .L7
818.L5:
819        addl    $12, %esp
820        ret
821        .p2align 4,,7
822.L7:
823        jl      .L4
824        cmpl    $0, %eax
825        .p2align 4,,8
826        ja      .L5
827.L4:
828        .p2align 4,,9
829        call    abort
830
831//===---------------------------------------------------------------------===//
832
833Tail call optimization improvements: Tail call optimization currently
834pushes all arguments on the top of the stack (their normal place for
835non-tail call optimized calls) that source from the callers arguments
836or  that source from a virtual register (also possibly sourcing from
837callers arguments).
838This is done to prevent overwriting of parameters (see example
839below) that might be used later.
840
841example:  
842
843int callee(int32, int64); 
844int caller(int32 arg1, int32 arg2) { 
845  int64 local = arg2 * 2; 
846  return callee(arg2, (int64)local); 
847}
848
849[arg1]          [!arg2 no longer valid since we moved local onto it]
850[arg2]      ->  [(int64)
851[RETADDR]        local  ]
852
853Moving arg1 onto the stack slot of callee function would overwrite
854arg2 of the caller.
855
856Possible optimizations:
857
858
859 - Analyse the actual parameters of the callee to see which would
860   overwrite a caller parameter which is used by the callee and only
861   push them onto the top of the stack.
862
863   int callee (int32 arg1, int32 arg2);
864   int caller (int32 arg1, int32 arg2) {
865       return callee(arg1,arg2);
866   }
867
868   Here we don't need to write any variables to the top of the stack
869   since they don't overwrite each other.
870
871   int callee (int32 arg1, int32 arg2);
872   int caller (int32 arg1, int32 arg2) {
873       return callee(arg2,arg1);
874   }
875
876   Here we need to push the arguments because they overwrite each
877   other.
878
879//===---------------------------------------------------------------------===//
880
881main ()
882{
883  int i = 0;
884  unsigned long int z = 0;
885
886  do {
887    z -= 0x00004000;
888    i++;
889    if (i > 0x00040000)
890      abort ();
891  } while (z > 0);
892  exit (0);
893}
894
895gcc compiles this to:
896
897_main:
898	subl	$28, %esp
899	xorl	%eax, %eax
900	jmp	L2
901L3:
902	cmpl	$262144, %eax
903	je	L10
904L2:
905	addl	$1, %eax
906	cmpl	$262145, %eax
907	jne	L3
908	call	L_abort$stub
909L10:
910	movl	$0, (%esp)
911	call	L_exit$stub
912
913llvm:
914
915_main:
916	subl	$12, %esp
917	movl	$1, %eax
918	movl	$16384, %ecx
919LBB1_1:	# bb
920	cmpl	$262145, %eax
921	jge	LBB1_4	# cond_true
922LBB1_2:	# cond_next
923	incl	%eax
924	addl	$4294950912, %ecx
925	cmpl	$16384, %ecx
926	jne	LBB1_1	# bb
927LBB1_3:	# bb11
928	xorl	%eax, %eax
929	addl	$12, %esp
930	ret
931LBB1_4:	# cond_true
932	call	L_abort$stub
933
9341. LSR should rewrite the first cmp with induction variable %ecx.
9352. DAG combiner should fold
936        leal    1(%eax), %edx
937        cmpl    $262145, %edx
938   =>
939        cmpl    $262144, %eax
940
941//===---------------------------------------------------------------------===//
942
943define i64 @test(double %X) {
944	%Y = fptosi double %X to i64
945	ret i64 %Y
946}
947
948compiles to:
949
950_test:
951	subl	$20, %esp
952	movsd	24(%esp), %xmm0
953	movsd	%xmm0, 8(%esp)
954	fldl	8(%esp)
955	fisttpll	(%esp)
956	movl	4(%esp), %edx
957	movl	(%esp), %eax
958	addl	$20, %esp
959	#FP_REG_KILL
960	ret
961
962This should just fldl directly from the input stack slot.
963
964//===---------------------------------------------------------------------===//
965
966This code:
967int foo (int x) { return (x & 65535) | 255; }
968
969Should compile into:
970
971_foo:
972        movzwl  4(%esp), %eax
973        orl     $255, %eax
974        ret
975
976instead of:
977_foo:
978	movl	$65280, %eax
979	andl	4(%esp), %eax
980	orl	$255, %eax
981	ret
982
983//===---------------------------------------------------------------------===//
984
985We're codegen'ing multiply of long longs inefficiently:
986
987unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
988  return arg1 *  arg2;
989}
990
991We compile to (fomit-frame-pointer):
992
993_LLM:
994	pushl	%esi
995	movl	8(%esp), %ecx
996	movl	16(%esp), %esi
997	movl	%esi, %eax
998	mull	%ecx
999	imull	12(%esp), %esi
1000	addl	%edx, %esi
1001	imull	20(%esp), %ecx
1002	movl	%esi, %edx
1003	addl	%ecx, %edx
1004	popl	%esi
1005	ret
1006
1007This looks like a scheduling deficiency and lack of remat of the load from
1008the argument area.  ICC apparently produces:
1009
1010        movl      8(%esp), %ecx
1011        imull     12(%esp), %ecx
1012        movl      16(%esp), %eax
1013        imull     4(%esp), %eax 
1014        addl      %eax, %ecx  
1015        movl      4(%esp), %eax
1016        mull      12(%esp) 
1017        addl      %ecx, %edx
1018        ret
1019
1020Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1021http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1022
1023//===---------------------------------------------------------------------===//
1024
1025We can fold a store into "zeroing a reg".  Instead of:
1026
1027xorl    %eax, %eax
1028movl    %eax, 124(%esp)
1029
1030we should get:
1031
1032movl    $0, 124(%esp)
1033
1034if the flags of the xor are dead.
1035
1036Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1037be folded into: shl [mem], 1
1038
1039//===---------------------------------------------------------------------===//
1040
1041In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1042or and instruction, for example:
1043
1044	xorpd	LCPI1_0, %xmm2
1045
1046However, if xmm2 gets spilled, we end up with really ugly code like this:
1047
1048	movsd	(%esp), %xmm0
1049	xorpd	LCPI1_0, %xmm0
1050	movsd	%xmm0, (%esp)
1051
1052Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1053the neg/abs instruction, turning it into an *integer* operation, like this:
1054
1055	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1056
1057you could also use xorb, but xorl is less likely to lead to a partial register
1058stall.  Here is a contrived testcase:
1059
1060double a, b, c;
1061void test(double *P) {
1062  double X = *P;
1063  a = X;
1064  bar();
1065  X = -X;
1066  b = X;
1067  bar();
1068  c = X;
1069}
1070
1071//===---------------------------------------------------------------------===//
1072
1073The generated code on x86 for checking for signed overflow on a multiply the
1074obvious way is much longer than it needs to be.
1075
1076int x(int a, int b) {
1077  long long prod = (long long)a*b;
1078  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1079}
1080
1081See PR2053 for more details.
1082
1083//===---------------------------------------------------------------------===//
1084
1085We should investigate using cdq/cltd (effect: edx = sar eax, 31)
1086more aggressively; it should cost the same as a move+shift on any modern
1087processor, but it's a lot shorter. Downside is that it puts more
1088pressure on register allocation because it has fixed operands.
1089
1090Example:
1091int abs(int x) {return x < 0 ? -x : x;}
1092
1093gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1094abs:
1095        movl    4(%esp), %eax
1096        cltd
1097        xorl    %edx, %eax
1098        subl    %edx, %eax
1099        ret
1100
1101//===---------------------------------------------------------------------===//
1102
1103Take the following code (from 
1104http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1105
1106extern unsigned char first_one[65536];
1107int FirstOnet(unsigned long long arg1)
1108{
1109  if (arg1 >> 48)
1110    return (first_one[arg1 >> 48]);
1111  return 0;
1112}
1113
1114
1115The following code is currently generated:
1116FirstOnet:
1117        movl    8(%esp), %eax
1118        cmpl    $65536, %eax
1119        movl    4(%esp), %ecx
1120        jb      .LBB1_2 # UnifiedReturnBlock
1121.LBB1_1:        # ifthen
1122        shrl    $16, %eax
1123        movzbl  first_one(%eax), %eax
1124        ret
1125.LBB1_2:        # UnifiedReturnBlock
1126        xorl    %eax, %eax
1127        ret
1128
1129We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1130lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1131
1132//===---------------------------------------------------------------------===//
1133
1134We compile this function:
1135
1136define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1137entry:
1138	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
1139	br i1 %tmp2, label %bb7, label %bb
1140
1141bb:		; preds = %entry
1142	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
1143	ret i32 %tmp6
1144
1145bb7:		; preds = %entry
1146	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
1147	ret i32 %tmp10
1148}
1149
1150to:
1151
1152foo:                                    # @foo
1153# BB#0:                                 # %entry
1154	movl	4(%esp), %ecx
1155	cmpb	$0, 16(%esp)
1156	je	.LBB0_2
1157# BB#1:                                 # %bb
1158	movl	8(%esp), %eax
1159	addl	%ecx, %eax
1160	ret
1161.LBB0_2:                                # %bb7
1162	movl	12(%esp), %edx
1163	movl	%ecx, %eax
1164	subl	%edx, %eax
1165	ret
1166
1167There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1168couple more movls by putting 4(%esp) into %eax instead of %ecx.
1169
1170//===---------------------------------------------------------------------===//
1171
1172See rdar://4653682.
1173
1174From flops:
1175
1176LBB1_15:        # bb310
1177        cvtss2sd        LCPI1_0, %xmm1
1178        addsd   %xmm1, %xmm0
1179        movsd   176(%esp), %xmm2
1180        mulsd   %xmm0, %xmm2
1181        movapd  %xmm2, %xmm3
1182        mulsd   %xmm3, %xmm3
1183        movapd  %xmm3, %xmm4
1184        mulsd   LCPI1_23, %xmm4
1185        addsd   LCPI1_24, %xmm4
1186        mulsd   %xmm3, %xmm4
1187        addsd   LCPI1_25, %xmm4
1188        mulsd   %xmm3, %xmm4
1189        addsd   LCPI1_26, %xmm4
1190        mulsd   %xmm3, %xmm4
1191        addsd   LCPI1_27, %xmm4
1192        mulsd   %xmm3, %xmm4
1193        addsd   LCPI1_28, %xmm4
1194        mulsd   %xmm3, %xmm4
1195        addsd   %xmm1, %xmm4
1196        mulsd   %xmm2, %xmm4
1197        movsd   152(%esp), %xmm1
1198        addsd   %xmm4, %xmm1
1199        movsd   %xmm1, 152(%esp)
1200        incl    %eax
1201        cmpl    %eax, %esi
1202        jge     LBB1_15 # bb310
1203LBB1_16:        # bb358.loopexit
1204        movsd   152(%esp), %xmm0
1205        addsd   %xmm0, %xmm0
1206        addsd   LCPI1_22, %xmm0
1207        movsd   %xmm0, 152(%esp)
1208
1209Rather than spilling the result of the last addsd in the loop, we should have
1210inserted a copy to split the interval (one for the duration of the loop, one
1211extending to the fall through). The register pressure in the loop isn't high
1212enough to warrant the spill.
1213
1214Also check why xmm7 is not used at all in the function.
1215
1216//===---------------------------------------------------------------------===//
1217
1218Take the following:
1219
1220target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
1221target triple = "i386-apple-darwin8"
1222@in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
1223define fastcc void @abort_gzip() noreturn nounwind  {
1224entry:
1225	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
1226	br i1 %tmp.b.i, label %bb.i, label %bb4.i
1227bb.i:		; preds = %entry
1228	tail call void @exit( i32 1 ) noreturn nounwind 
1229	unreachable
1230bb4.i:		; preds = %entry
1231	store i1 true, i1* @in_exit.4870.b
1232	tail call void @exit( i32 1 ) noreturn nounwind 
1233	unreachable
1234}
1235declare void @exit(i32) noreturn nounwind 
1236
1237This compiles into:
1238_abort_gzip:                            ## @abort_gzip
1239## BB#0:                                ## %entry
1240	subl	$12, %esp
1241	movb	_in_exit.4870.b, %al
1242	cmpb	$1, %al
1243	jne	LBB0_2
1244
1245We somehow miss folding the movb into the cmpb.
1246
1247//===---------------------------------------------------------------------===//
1248
1249We compile:
1250
1251int test(int x, int y) {
1252  return x-y-1;
1253}
1254
1255into (-m64):
1256
1257_test:
1258	decl	%edi
1259	movl	%edi, %eax
1260	subl	%esi, %eax
1261	ret
1262
1263it would be better to codegen as: x+~y  (notl+addl)
1264
1265//===---------------------------------------------------------------------===//
1266
1267This code:
1268
1269int foo(const char *str,...)
1270{
1271 __builtin_va_list a; int x;
1272 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1273 return x;
1274}
1275
1276gets compiled into this on x86-64:
1277	subq    $200, %rsp
1278        movaps  %xmm7, 160(%rsp)
1279        movaps  %xmm6, 144(%rsp)
1280        movaps  %xmm5, 128(%rsp)
1281        movaps  %xmm4, 112(%rsp)
1282        movaps  %xmm3, 96(%rsp)
1283        movaps  %xmm2, 80(%rsp)
1284        movaps  %xmm1, 64(%rsp)
1285        movaps  %xmm0, 48(%rsp)
1286        movq    %r9, 40(%rsp)
1287        movq    %r8, 32(%rsp)
1288        movq    %rcx, 24(%rsp)
1289        movq    %rdx, 16(%rsp)
1290        movq    %rsi, 8(%rsp)
1291        leaq    (%rsp), %rax
1292        movq    %rax, 192(%rsp)
1293        leaq    208(%rsp), %rax
1294        movq    %rax, 184(%rsp)
1295        movl    $48, 180(%rsp)
1296        movl    $8, 176(%rsp)
1297        movl    176(%rsp), %eax
1298        cmpl    $47, %eax
1299        jbe     .LBB1_3 # bb
1300.LBB1_1:        # bb3
1301        movq    184(%rsp), %rcx
1302        leaq    8(%rcx), %rax
1303        movq    %rax, 184(%rsp)
1304.LBB1_2:        # bb4
1305        movl    (%rcx), %eax
1306        addq    $200, %rsp
1307        ret
1308.LBB1_3:        # bb
1309        movl    %eax, %ecx
1310        addl    $8, %eax
1311        addq    192(%rsp), %rcx
1312        movl    %eax, 176(%rsp)
1313        jmp     .LBB1_2 # bb4
1314
1315gcc 4.3 generates:
1316	subq    $96, %rsp
1317.LCFI0:
1318        leaq    104(%rsp), %rax
1319        movq    %rsi, -80(%rsp)
1320        movl    $8, -120(%rsp)
1321        movq    %rax, -112(%rsp)
1322        leaq    -88(%rsp), %rax
1323        movq    %rax, -104(%rsp)
1324        movl    $8, %eax
1325        cmpl    $48, %eax
1326        jb      .L6
1327        movq    -112(%rsp), %rdx
1328        movl    (%rdx), %eax
1329        addq    $96, %rsp
1330        ret
1331        .p2align 4,,10
1332        .p2align 3
1333.L6:
1334        mov     %eax, %edx
1335        addq    -104(%rsp), %rdx
1336        addl    $8, %eax
1337        movl    %eax, -120(%rsp)
1338        movl    (%rdx), %eax
1339        addq    $96, %rsp
1340        ret
1341
1342and it gets compiled into this on x86:
1343	pushl   %ebp
1344        movl    %esp, %ebp
1345        subl    $4, %esp
1346        leal    12(%ebp), %eax
1347        movl    %eax, -4(%ebp)
1348        leal    16(%ebp), %eax
1349        movl    %eax, -4(%ebp)
1350        movl    12(%ebp), %eax
1351        addl    $4, %esp
1352        popl    %ebp
1353        ret
1354
1355gcc 4.3 generates:
1356	pushl   %ebp
1357        movl    %esp, %ebp
1358        movl    12(%ebp), %eax
1359        popl    %ebp
1360        ret
1361
1362//===---------------------------------------------------------------------===//
1363
1364Teach tblgen not to check bitconvert source type in some cases. This allows us
1365to consolidate the following patterns in X86InstrMMX.td:
1366
1367def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1368                                                  (iPTR 0))))),
1369          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1370def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1371                                                  (iPTR 0))))),
1372          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1373def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1374                                                  (iPTR 0))))),
1375          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1376
1377There are other cases in various td files.
1378
1379//===---------------------------------------------------------------------===//
1380
1381Take something like the following on x86-32:
1382unsigned a(unsigned long long x, unsigned y) {return x % y;}
1383
1384We currently generate a libcall, but we really shouldn't: the expansion is
1385shorter and likely faster than the libcall.  The expected code is something
1386like the following:
1387
1388	movl	12(%ebp), %eax
1389	movl	16(%ebp), %ecx
1390	xorl	%edx, %edx
1391	divl	%ecx
1392	movl	8(%ebp), %eax
1393	divl	%ecx
1394	movl	%edx, %eax
1395	ret
1396
1397A similar code sequence works for division.
1398
1399//===---------------------------------------------------------------------===//
1400
1401These should compile to the same code, but the latter codegen's to useless
1402instructions on X86. This may be a trivial dag combine (GCC PR7061):
1403
1404struct s1 { unsigned char a, b; };
1405unsigned long f1(struct s1 x) {
1406    return x.a + x.b;
1407}
1408struct s2 { unsigned a: 8, b: 8; };
1409unsigned long f2(struct s2 x) {
1410    return x.a + x.b;
1411}
1412
1413//===---------------------------------------------------------------------===//
1414
1415We currently compile this:
1416
1417define i32 @func1(i32 %v1, i32 %v2) nounwind {
1418entry:
1419  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1420  %sum = extractvalue {i32, i1} %t, 0
1421  %obit = extractvalue {i32, i1} %t, 1
1422  br i1 %obit, label %overflow, label %normal
1423normal:
1424  ret i32 %sum
1425overflow:
1426  call void @llvm.trap()
1427  unreachable
1428}
1429declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1430declare void @llvm.trap()
1431
1432to:
1433
1434_func1:
1435	movl	4(%esp), %eax
1436	addl	8(%esp), %eax
1437	jo	LBB1_2	## overflow
1438LBB1_1:	## normal
1439	ret
1440LBB1_2:	## overflow
1441	ud2
1442
1443it would be nice to produce "into" someday.
1444
1445//===---------------------------------------------------------------------===//
1446
1447This code:
1448
1449void vec_mpys1(int y[], const int x[], int scaler) {
1450int i;
1451for (i = 0; i < 150; i++)
1452 y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1453}
1454
1455Compiles to this loop with GCC 3.x:
1456
1457.L5:
1458	movl	%ebx, %eax
1459	imull	(%edi,%ecx,4)
1460	shrdl	$31, %edx, %eax
1461	addl	%eax, (%esi,%ecx,4)
1462	incl	%ecx
1463	cmpl	$149, %ecx
1464	jle	.L5
1465
1466llvm-gcc compiles it to the much uglier:
1467
1468LBB1_1:	## bb1
1469	movl	24(%esp), %eax
1470	movl	(%eax,%edi,4), %ebx
1471	movl	%ebx, %ebp
1472	imull	%esi, %ebp
1473	movl	%ebx, %eax
1474	mull	%ecx
1475	addl	%ebp, %edx
1476	sarl	$31, %ebx
1477	imull	%ecx, %ebx
1478	addl	%edx, %ebx
1479	shldl	$1, %eax, %ebx
1480	movl	20(%esp), %eax
1481	addl	%ebx, (%eax,%edi,4)
1482	incl	%edi
1483	cmpl	$150, %edi
1484	jne	LBB1_1	## bb1
1485
1486The issue is that we hoist the cast of "scaler" to long long outside of the
1487loop, the value comes into the loop as two values, and
1488RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1489constructed BUILD_PAIR which represents the cast value.
1490
1491This can be handled by making CodeGenPrepare sink the cast.
1492
1493//===---------------------------------------------------------------------===//
1494
1495Test instructions can be eliminated by using EFLAGS values from arithmetic
1496instructions. This is currently not done for mul, and, or, xor, neg, shl,
1497sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1498for read-modify-write instructions. It is also currently not done if the
1499OF or CF flags are needed.
1500
1501The shift operators have the complication that when the shift count is
1502zero, EFLAGS is not set, so they can only subsume a test instruction if
1503the shift count is known to be non-zero. Also, using the EFLAGS value
1504from a shift is apparently very slow on some x86 implementations.
1505
1506In read-modify-write instructions, the root node in the isel match is
1507the store, and isel has no way for the use of the EFLAGS result of the
1508arithmetic to be remapped to the new node.
1509
1510Add and subtract instructions set OF on signed overflow and CF on unsigned
1511overflow, while test instructions always clear OF and CF. In order to
1512replace a test with an add or subtract in a situation where OF or CF is
1513needed, codegen must be able to prove that the operation cannot see
1514signed or unsigned overflow, respectively.
1515
1516//===---------------------------------------------------------------------===//
1517
1518memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1519define <16 x float> @foo(<16 x float> %A) nounwind {
1520	%tmp = alloca <16 x float>, align 16
1521	%tmp2 = alloca <16 x float>, align 16
1522	store <16 x float> %A, <16 x float>* %tmp
1523	%s = bitcast <16 x float>* %tmp to i8*
1524	%s2 = bitcast <16 x float>* %tmp2 to i8*
1525	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1526	%R = load <16 x float>* %tmp2
1527	ret <16 x float> %R
1528}
1529
1530declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1531
1532which compiles to:
1533
1534_foo:
1535	subl	$140, %esp
1536	movaps	%xmm3, 112(%esp)
1537	movaps	%xmm2, 96(%esp)
1538	movaps	%xmm1, 80(%esp)
1539	movaps	%xmm0, 64(%esp)
1540	movl	60(%esp), %eax
1541	movl	%eax, 124(%esp)
1542	movl	56(%esp), %eax
1543	movl	%eax, 120(%esp)
1544	movl	52(%esp), %eax
1545        <many many more 32-bit copies>
1546      	movaps	(%esp), %xmm0
1547	movaps	16(%esp), %xmm1
1548	movaps	32(%esp), %xmm2
1549	movaps	48(%esp), %xmm3
1550	addl	$140, %esp
1551	ret
1552
1553On Nehalem, it may even be cheaper to just use movups when unaligned than to
1554fall back to lower-granularity chunks.
1555
1556//===---------------------------------------------------------------------===//
1557
1558Implement processor-specific optimizations for parity with GCC on these
1559processors.  GCC does two optimizations:
1560
15611. ix86_pad_returns inserts a noop before ret instructions if immediately
1562   preceded by a conditional branch or is the target of a jump.
15632. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1564   code contains more than 3 branches.
1565   
1566The first one is done for all AMDs, Core2, and "Generic"
1567The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1568  Core 2, and "Generic"
1569
1570//===---------------------------------------------------------------------===//
1571
1572Testcase:
1573int a(int x) { return (x & 127) > 31; }
1574
1575Current output:
1576	movl	4(%esp), %eax
1577	andl	$127, %eax
1578	cmpl	$31, %eax
1579	seta	%al
1580	movzbl	%al, %eax
1581	ret
1582
1583Ideal output:
1584	xorl	%eax, %eax
1585	testl	$96, 4(%esp)
1586	setne	%al
1587	ret
1588
1589This should definitely be done in instcombine, canonicalizing the range
1590condition into a != condition.  We get this IR:
1591
1592define i32 @a(i32 %x) nounwind readnone {
1593entry:
1594	%0 = and i32 %x, 127		; <i32> [#uses=1]
1595	%1 = icmp ugt i32 %0, 31		; <i1> [#uses=1]
1596	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
1597	ret i32 %2
1598}
1599
1600Instcombine prefers to strength reduce relational comparisons to equality
1601comparisons when possible, this should be another case of that.  This could
1602be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1603looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
1604be redesigned to use ComputeMaskedBits and friends.
1605
1606
1607//===---------------------------------------------------------------------===//
1608Testcase:
1609int x(int a) { return (a&0xf0)>>4; }
1610
1611Current output:
1612	movl	4(%esp), %eax
1613	shrl	$4, %eax
1614	andl	$15, %eax
1615	ret
1616
1617Ideal output:
1618	movzbl	4(%esp), %eax
1619	shrl	$4, %eax
1620	ret
1621
1622//===---------------------------------------------------------------------===//
1623
1624Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1625properly.
1626
1627When the return value is not used (i.e. only care about the value in the
1628memory), x86 does not have to use add to implement these. Instead, it can use
1629add, sub, inc, dec instructions with the "lock" prefix.
1630
1631This is currently implemented using a bit of instruction selection trick. The
1632issue is the target independent pattern produces one output and a chain and we
1633want to map it into one that just output a chain. The current trick is to select
1634it into a MERGE_VALUES with the first definition being an implicit_def. The
1635proper solution is to add new ISD opcodes for the no-output variant. DAG
1636combiner can then transform the node before it gets to target node selection.
1637
1638Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1639fact these instructions are identical to the non-lock versions. We need a way to
1640add target specific information to target nodes and have this information
1641carried over to machine instructions. Asm printer (or JIT) can use this
1642information to add the "lock" prefix.
1643
1644//===---------------------------------------------------------------------===//
1645
1646struct B {
1647  unsigned char y0 : 1;
1648};
1649
1650int bar(struct B* a) { return a->y0; }
1651
1652define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1653  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1654  %2 = load i8* %1, align 1
1655  %3 = and i8 %2, 1
1656  %4 = zext i8 %3 to i32
1657  ret i32 %4
1658}
1659
1660bar:                                    # @bar
1661# BB#0:
1662        movb    (%rdi), %al
1663        andb    $1, %al
1664        movzbl  %al, %eax
1665        ret
1666
1667Missed optimization: should be movl+andl.
1668
1669//===---------------------------------------------------------------------===//
1670
1671The x86_64 abi says:
1672
1673Booleans, when stored in a memory object, are stored as single byte objects the
1674value of which is always 0 (false) or 1 (true).
1675
1676We are not using this fact:
1677
1678int bar(_Bool *a) { return *a; }
1679
1680define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1681  %1 = load i8* %a, align 1, !tbaa !0
1682  %tmp = and i8 %1, 1
1683  %2 = zext i8 %tmp to i32
1684  ret i32 %2
1685}
1686
1687bar:
1688        movb    (%rdi), %al
1689        andb    $1, %al
1690        movzbl  %al, %eax
1691        ret
1692
1693GCC produces
1694
1695bar:
1696        movzbl  (%rdi), %eax
1697        ret
1698
1699//===---------------------------------------------------------------------===//
1700
1701Consider the following two functions compiled with clang:
1702_Bool foo(int *x) { return !(*x & 4); }
1703unsigned bar(int *x) { return !(*x & 4); }
1704
1705foo:
1706	movl	4(%esp), %eax
1707	testb	$4, (%eax)
1708	sete	%al
1709	movzbl	%al, %eax
1710	ret
1711
1712bar:
1713	movl	4(%esp), %eax
1714	movl	(%eax), %eax
1715	shrl	$2, %eax
1716	andl	$1, %eax
1717	xorl	$1, %eax
1718	ret
1719
1720The second function generates more code even though the two functions
1721are functionally identical.
1722
1723//===---------------------------------------------------------------------===//
1724
1725Take the following C code:
1726int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1727
1728We generate the following IR with clang:
1729define i32 @f(i32 %a, i32 %b) nounwind readnone {
1730entry:
1731  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1732  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1733  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1734  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1735  ret i32 %conv5
1736}
1737
1738And the following x86 code:
1739	xorl	%esi, %edi
1740	testb	$-1, %dil
1741	sete	%al
1742	movzbl	%al, %eax
1743	ret
1744
1745A cmpb instead of the xorl+testb would be one instruction shorter.
1746
1747//===---------------------------------------------------------------------===//
1748
1749Given the following C code:
1750int f(int a, int b) { return (signed char)a == (signed char)b; }
1751
1752We generate the following IR with clang:
1753define i32 @f(i32 %a, i32 %b) nounwind readnone {
1754entry:
1755  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1756  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1757  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1758  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1759  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1760  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1761  ret i32 %conv5
1762}
1763
1764And the following x86 code:
1765	movsbl	%sil, %eax
1766	movsbl	%dil, %ecx
1767	cmpl	%eax, %ecx
1768	sete	%al
1769	movzbl	%al, %eax
1770	ret
1771
1772
1773It should be possible to eliminate the sign extensions.
1774
1775//===---------------------------------------------------------------------===//
1776
1777LLVM misses a load+store narrowing opportunity in this code:
1778
1779%struct.bf = type { i64, i16, i16, i32 }
1780
1781@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1782
1783define void @t1() nounwind ssp {
1784entry:
1785  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1786  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1787  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1788  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1789  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1790  store i32 %4, i32* %2, align 1
1791  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1792  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1793  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1794  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1795  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1796  store i32 %9, i32* %7, align 1
1797  ret void
1798}
1799
1800LLVM currently emits this:
1801
1802  movq  bfi(%rip), %rax
1803  andl  $-65537, 8(%rax)
1804  movq  bfi(%rip), %rax
1805  andl  $-131073, 8(%rax)
1806  ret
1807
1808It could narrow the loads and stores to emit this:
1809
1810  movq  bfi(%rip), %rax
1811  andb  $-2, 10(%rax)
1812  movq  bfi(%rip), %rax
1813  andb  $-3, 10(%rax)
1814  ret
1815
1816The trouble is that there is a TokenFactor between the store and the
1817load, making it non-trivial to determine if there's anything between
1818the load and the store which would prohibit narrowing.
1819
1820//===---------------------------------------------------------------------===//
1821
1822This code:
1823void foo(unsigned x) {
1824  if (x == 0) bar();
1825  else if (x == 1) qux();
1826}
1827
1828currently compiles into:
1829_foo:
1830	movl	4(%esp), %eax
1831	cmpl	$1, %eax
1832	je	LBB0_3
1833	testl	%eax, %eax
1834	jne	LBB0_4
1835
1836the testl could be removed:
1837_foo:
1838	movl	4(%esp), %eax
1839	cmpl	$1, %eax
1840	je	LBB0_3
1841	jb	LBB0_4
1842
18430 is the only unsigned number < 1.
1844
1845//===---------------------------------------------------------------------===//
1846
1847This code:
1848
1849%0 = type { i32, i1 }
1850
1851define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1852entry:
1853  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1854  %cmp = extractvalue %0 %uadd, 1
1855  %inc = zext i1 %cmp to i32
1856  %add = add i32 %x, %sum
1857  %z.0 = add i32 %add, %inc
1858  ret i32 %z.0
1859}
1860
1861declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1862
1863compiles to:
1864
1865_add32carry:                            ## @add32carry
1866	addl	%esi, %edi
1867	sbbl	%ecx, %ecx
1868	movl	%edi, %eax
1869	subl	%ecx, %eax
1870	ret
1871
1872But it could be:
1873
1874_add32carry:
1875	leal	(%rsi,%rdi), %eax
1876	cmpl	%esi, %eax
1877	adcl	$0, %eax
1878	ret
1879
1880//===---------------------------------------------------------------------===//
1881
1882The hot loop of 256.bzip2 contains code that looks a bit like this:
1883
1884int foo(char *P, char *Q, int x, int y) {
1885  if (P[0] != Q[0])
1886     return P[0] < Q[0];
1887  if (P[1] != Q[1])
1888     return P[1] < Q[1];
1889  if (P[2] != Q[2])
1890     return P[2] < Q[2];
1891   return P[3] < Q[3];
1892}
1893
1894In the real code, we get a lot more wrong than this.  However, even in this
1895code we generate:
1896
1897_foo:                                   ## @foo
1898## BB#0:                                ## %entry
1899	movb	(%rsi), %al
1900	movb	(%rdi), %cl
1901	cmpb	%al, %cl
1902	je	LBB0_2
1903LBB0_1:                                 ## %if.then
1904	cmpb	%al, %cl
1905	jmp	LBB0_5
1906LBB0_2:                                 ## %if.end
1907	movb	1(%rsi), %al
1908	movb	1(%rdi), %cl
1909	cmpb	%al, %cl
1910	jne	LBB0_1
1911## BB#3:                                ## %if.end38
1912	movb	2(%rsi), %al
1913	movb	2(%rdi), %cl
1914	cmpb	%al, %cl
1915	jne	LBB0_1
1916## BB#4:                                ## %if.end60
1917	movb	3(%rdi), %al
1918	cmpb	3(%rsi), %al
1919LBB0_5:                                 ## %if.end60
1920	setl	%al
1921	movzbl	%al, %eax
1922	ret
1923
1924Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1925redundant compare also forces the register values to be live, which prevents
1926folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1927
1928_foo:
1929	movzbl	(%rsi), %eax
1930	cmpb	%al, (%rdi)
1931	jne	L10
1932L12:
1933	movzbl	1(%rsi), %eax
1934	cmpb	%al, 1(%rdi)
1935	jne	L10
1936	movzbl	2(%rsi), %eax
1937	cmpb	%al, 2(%rdi)
1938	jne	L10
1939	movzbl	3(%rdi), %eax
1940	cmpb	3(%rsi), %al
1941L10:
1942	setl	%al
1943	movzbl	%al, %eax
1944	ret
1945
1946which is "perfect".
1947
1948//===---------------------------------------------------------------------===//
1949
1950For the branch in the following code:
1951int a();
1952int b(int x, int y) {
1953  if (x & (1<<(y&7)))
1954    return a();
1955  return y;
1956}
1957
1958We currently generate:
1959	movb	%sil, %al
1960	andb	$7, %al
1961	movzbl	%al, %eax
1962	btl	%eax, %edi
1963	jae	.LBB0_2
1964
1965movl+andl would be shorter than the movb+andb+movzbl sequence.
1966
1967//===---------------------------------------------------------------------===//
1968
1969For the following:
1970struct u1 {
1971    float x, y;
1972};
1973float foo(struct u1 u) {
1974    return u.x + u.y;
1975}
1976
1977We currently generate:
1978	movdqa	%xmm0, %xmm1
1979	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
1980	addss	%xmm1, %xmm0
1981	ret
1982
1983We could save an instruction here by commuting the addss.
1984
1985//===---------------------------------------------------------------------===//
1986
1987This (from PR9661):
1988
1989float clamp_float(float a) {
1990        if (a > 1.0f)
1991                return 1.0f;
1992        else if (a < 0.0f)
1993                return 0.0f;
1994        else
1995                return a;
1996}
1997
1998Could compile to:
1999
2000clamp_float:                            # @clamp_float
2001        movss   .LCPI0_0(%rip), %xmm1
2002        minss   %xmm1, %xmm0
2003        pxor    %xmm1, %xmm1
2004        maxss   %xmm1, %xmm0
2005        ret
2006
2007with -ffast-math.
2008
2009//===---------------------------------------------------------------------===//
2010
2011This function (from PR9803):
2012
2013int clamp2(int a) {
2014        if (a > 5)
2015                a = 5;
2016        if (a < 0) 
2017                return 0;
2018        return a;
2019}
2020
2021Compiles to:
2022
2023_clamp2:                                ## @clamp2
2024        pushq   %rbp
2025        movq    %rsp, %rbp
2026        cmpl    $5, %edi
2027        movl    $5, %ecx
2028        cmovlel %edi, %ecx
2029        testl   %ecx, %ecx
2030        movl    $0, %eax
2031        cmovnsl %ecx, %eax
2032        popq    %rbp
2033        ret
2034
2035The move of 0 could be scheduled above the test so that it becomes xor reg,reg.
2036
2037//===---------------------------------------------------------------------===//
2038
2039GCC PR48986.  We currently compile this:
2040
2041void bar(void);
2042void yyy(int* p) {
2043    if (__sync_fetch_and_add(p, -1) == 1)
2044      bar();
2045}
2046
2047into:
2048	movl	$-1, %eax
2049	lock
2050	xaddl	%eax, (%rdi)
2051	cmpl	$1, %eax
2052	je	LBB0_2
2053
2054Instead we could generate:
2055
2056	lock
	decl	(%rdi)
2058	je LBB0_2
2059
2060The trick is to match "fetch_and_add(X, -C) == C".
2061
2062//===---------------------------------------------------------------------===//
2063
2064unsigned t(unsigned a, unsigned b) {
2065  return a <= b ? 5 : -5;
2066}
2067
2068We generate:
2069	movl	$5, %ecx
2070	cmpl	%esi, %edi
2071	movl	$-5, %eax
2072	cmovbel	%ecx, %eax
2073
2074GCC:
2075	cmpl	%edi, %esi
2076	sbbl	%eax, %eax
2077	andl	$-10, %eax
2078	addl	$5, %eax
2079
2080//===---------------------------------------------------------------------===//
2081