1//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend: SSE-specific stuff.
3//===---------------------------------------------------------------------===//
4
5//===---------------------------------------------------------------------===//
6
7SSE Variable shift can be custom lowered to something like this, which uses a
8small table + unaligned load + shuffle instead of going through memory.
9
10__m128i_shift_right:
11	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
12	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
13
14...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}
19
20//===---------------------------------------------------------------------===//
21
SSE has instructions for doing operations on complex numbers; we should pattern
match them.  For example, this should turn into a horizontal add:
24
25typedef float __attribute__((vector_size(16))) v4f32;
26float f32(v4f32 A) {
27  return A[0]+A[1]+A[2]+A[3];
28}
29
30Instead we get this:
31
32_f32:                                   ## @f32
33	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
34	addss	%xmm0, %xmm1
35	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
36	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
37	movaps	%xmm0, %xmm3
38	addss	%xmm1, %xmm3
39	movdqa	%xmm2, %xmm0
40	addss	%xmm3, %xmm0
41	ret
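
For reference, here is a hand-written sketch of the horizontal-add form we
would like to produce (assumes SSE3 and fast-math reassociation; the name is
made up, and it takes __m128 directly for brevity):

#include <pmmintrin.h>

float f32_hadd(__m128 A) {
  __m128 v = _mm_hadd_ps(A, A);   // {a0+a1, a2+a3, a0+a1, a2+a3}
  v = _mm_hadd_ps(v, v);          // full sum in every lane
  return _mm_cvtss_f32(v);        // extract lane 0
}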
42
Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:
45
46_Complex float f32(_Complex float A, _Complex float B) {
47  return A+B;
48}
49
50into:
51
52_f32:                                   ## @f32
53	movdqa	%xmm0, %xmm2
54	addss	%xmm1, %xmm2
55	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
56	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
57	addss	%xmm1, %xmm3
58	movaps	%xmm2, %xmm0
59	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
60	ret
61
62seems silly when it could just be one addps.
63
64
65//===---------------------------------------------------------------------===//
66
67Expand libm rounding functions inline:  Significant speedups possible.
68http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
69
70//===---------------------------------------------------------------------===//
71
When compiled with unsafe FP math enabled, "main" should enable SSE DAZ mode
and other fast SSE modes.
74
75//===---------------------------------------------------------------------===//
76
77Think about doing i64 math in SSE regs on x86-32.
78
79//===---------------------------------------------------------------------===//
80
81This testcase should have no SSE instructions in it, and only one load from
82a constant pool:
83
84double %test3(bool %B) {
85        %C = select bool %B, double 123.412, double 523.01123123
86        ret double %C
87}
88
Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.
91
92The pattern isel got this one right.
93
94//===---------------------------------------------------------------------===//
95
96SSE should implement 'select_cc' using 'emulated conditional moves' that use
97pcmp/pand/pandn/por to do a selection instead of a conditional branch:
98
99double %X(double %Y, double %Z, double %A, double %B) {
100        %C = setlt double %A, %B
101        %z = fadd double %Z, 0.0    ;; select operand is not a load
102        %D = select bool %C, double %Y, double %z
103        ret double %D
104}
105
106We currently emit:
107
108_X:
109        subl $12, %esp
110        xorpd %xmm0, %xmm0
111        addsd 24(%esp), %xmm0
112        movsd 32(%esp), %xmm1
113        movsd 16(%esp), %xmm2
114        ucomisd 40(%esp), %xmm1
115        jb LBB_X_2
116LBB_X_1:
117        movsd %xmm0, %xmm2
118LBB_X_2:
119        movsd %xmm2, (%esp)
120        fldl (%esp)
121        addl $12, %esp
122        ret
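
A hand-written sketch of the cmp/and/andnot/or ("emulated cmov") idiom for the
function above, using SSE2 intrinsics (the function name is illustrative; the
real fix belongs in isel):

#include <emmintrin.h>

double X_branchless(double Y, double Z, double A, double B) {
  __m128d mask = _mm_cmplt_sd(_mm_set_sd(A), _mm_set_sd(B)); // all-ones if A < B
  __m128d r = _mm_or_pd(_mm_and_pd(mask, _mm_set_sd(Y)),     // mask ? Y : 0
                        _mm_andnot_pd(mask, _mm_set_sd(Z))); // mask ? 0 : Z
  return _mm_cvtsd_f64(r);
}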
123
124//===---------------------------------------------------------------------===//
125
126Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
127feasible.
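
As a rough illustration (hand-written, not backend output), a small memset with
a known size lowered to 128-bit stores might look like:

#include <emmintrin.h>
#include <stddef.h>

static void memset_sse(void *dst, int c, size_t n) {  // n a multiple of 16
  __m128i v = _mm_set1_epi8((char)c);
  for (size_t i = 0; i < n; i += 16)
    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
}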
128
129//===---------------------------------------------------------------------===//
130
131Codegen:
132  if (copysign(1.0, x) == copysign(1.0, y))
133into:
134  if (x^y & mask)
135when using SSE.
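
A sketch of the target form in C (hand-written; the helper name is made up):
the signs are equal iff the xor of the bit patterns has the sign bit clear.

#include <stdint.h>
#include <string.h>

static int same_sign(double x, double y) {
  uint64_t xi, yi;
  memcpy(&xi, &x, sizeof xi);
  memcpy(&yi, &y, sizeof yi);
  return ((xi ^ yi) & 0x8000000000000000ULL) == 0;  // sign bits match
}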
136
137//===---------------------------------------------------------------------===//
138
Use movhps to update the upper 64 bits of a v4sf value, and movlps for the
lower half.
141
142//===---------------------------------------------------------------------===//
143
Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
146
147//===---------------------------------------------------------------------===//
148
149External test Nurbs exposed some problems. Look for
150__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
151emits:
152
153        movaps    (%edx), %xmm2                                 #59.21
154        movaps    (%edx), %xmm5                                 #60.21
155        movaps    (%edx), %xmm4                                 #61.21
156        movaps    (%edx), %xmm3                                 #62.21
157        movl      40(%ecx), %ebp                                #69.49
158        shufps    $0, %xmm2, %xmm5                              #60.21
159        movl      100(%esp), %ebx                               #69.20
160        movl      (%ebx), %edi                                  #69.20
161        imull     %ebp, %edi                                    #69.49
162        addl      (%eax), %edi                                  #70.33
163        shufps    $85, %xmm2, %xmm4                             #61.21
164        shufps    $170, %xmm2, %xmm3                            #62.21
165        shufps    $255, %xmm2, %xmm2                            #63.21
166        lea       (%ebp,%ebp,2), %ebx                           #69.49
167        negl      %ebx                                          #69.49
168        lea       -3(%edi,%ebx), %ebx                           #70.33
169        shll      $4, %ebx                                      #68.37
170        addl      32(%ecx), %ebx                                #68.37
171        testb     $15, %bl                                      #91.13
172        jne       L_B1.24       # Prob 5%                       #91.13
173
174This is the llvm code after instruction scheduling:
175
176cond_next140 (0xa910740, LLVM BB @0xa90beb0):
177	%reg1078 = MOV32ri -3
178	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
179	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
180	%reg1080 = IMUL32rr %reg1079, %reg1037
181	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
182	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
183	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
184	%reg1082 = SHL32ri %reg1038, 4
185	%reg1039 = ADD32rr %reg1036, %reg1082
186	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
187	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
188	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
189	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
190	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
191	%reg1040 = MOV32rr %reg1039
192	%reg1084 = AND32ri8 %reg1039, 15
193	CMP32ri8 %reg1084, 0
194	JE mbb<cond_next204,0xa914d30>
195
196Still ok. After register allocation:
197
198cond_next140 (0xa910740, LLVM BB @0xa90beb0):
199	%EAX = MOV32ri -3
200	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
201	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
202	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
203	%EDX = MOV32rm %EDX, 1, %NOREG, 40
204	IMUL32rr %EAX<def&use>, %EDX
205	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
206	%ESI = MOV32rm %ESI, 1, %NOREG, 0
207	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
208	%EAX = LEA32r %ESI, 1, %EAX, -3
209	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
210	%ESI = MOV32rm %ESI, 1, %NOREG, 32
211	%EDI = MOV32rr %EAX
212	SHL32ri %EDI<def&use>, 4
213	ADD32rr %EDI<def&use>, %ESI
214	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
215	%XMM1 = MOVAPSrr %XMM0
216	SHUFPSrr %XMM1<def&use>, %XMM1, 170
217	%XMM2 = MOVAPSrr %XMM0
218	SHUFPSrr %XMM2<def&use>, %XMM2, 0
219	%XMM3 = MOVAPSrr %XMM0
220	SHUFPSrr %XMM3<def&use>, %XMM3, 255
221	SHUFPSrr %XMM0<def&use>, %XMM0, 85
222	%EBX = MOV32rr %EDI
223	AND32ri8 %EBX<def&use>, 15
224	CMP32ri8 %EBX, 0
225	JE mbb<cond_next204,0xa914d30>
226
This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two in more than one shufps op, a
number of copies result.  Note that icc suffers from the same problem.  Either
the instruction selector should select pshufd, or the register allocator
should perform the two-address to three-address transformation.
232
233It also exposes some other problems. See MOV32ri -3 and the spills.
234
235//===---------------------------------------------------------------------===//
236
237Consider:
238
239__m128 test(float a) {
240  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
241}
242
243This compiles into:
244
245movss 4(%esp), %xmm1
246mulss %xmm1, %xmm1
247xorps %xmm0, %xmm0
248movss %xmm1, %xmm0
249ret
250
Because the movss load already zeroed the top 3 elements and mulss doesn't
modify them, the top elements of xmm1 are still zero.  We could compile this to:
253
254movss 4(%esp), %xmm0
255mulss %xmm0, %xmm0
256ret
257
258//===---------------------------------------------------------------------===//
259
260Here's a sick and twisted idea.  Consider code like this:
261
__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}
267
268This might compile to this code:
269
270movaps c(%esp), %xmm1
271xorps %xmm0, %xmm0
272movss %xmm1, %xmm0
273ret
274
275Now consider if the ... code caused xmm1 to get spilled.  This might produce
276this code:
277
278movaps c(%esp), %xmm1
279movaps %xmm1, c2(%esp)
280...
281
282xorps %xmm0, %xmm0
283movaps c2(%esp), %xmm1
284movss %xmm1, %xmm0
285ret
286
287However, since the reload is only used by these instructions, we could 
288"fold" it into the uses, producing something like this:
289
290movaps c(%esp), %xmm1
291movaps %xmm1, c2(%esp)
292...
293
294movss c2(%esp), %xmm0
295ret
296
297... saving two instructions.
298
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros plus the one element instead of all 4 elements.
This can be used to simplify a variety of shuffle operations, where the other
elements are known zeros.
303
304//===---------------------------------------------------------------------===//
305
306This code generates ugly code, probably due to costs being off or something:
307
308define void @test(float* %P, <4 x float>* %P2 ) {
309        %xFloat0.688 = load float* %P
310        %tmp = load <4 x float>* %P2
311        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
312        store <4 x float> %inFloat3.713, <4 x float>* %P2
313        ret void
314}
315
316Generates:
317
318_test:
319	movl	8(%esp), %eax
320	movaps	(%eax), %xmm0
321	pxor	%xmm1, %xmm1
322	movaps	%xmm0, %xmm2
323	shufps	$50, %xmm1, %xmm2
324	shufps	$132, %xmm2, %xmm0
325	movaps	%xmm0, (%eax)
326	ret
327
328Would it be better to generate:
329
330_test:
331        movl 8(%esp), %ecx
332        movaps (%ecx), %xmm0
        xor %eax, %eax
334        pinsrw $6, %eax, %xmm0
335        pinsrw $7, %eax, %xmm0
336        movaps %xmm0, (%ecx)
337        ret
338
339?
340
341//===---------------------------------------------------------------------===//
342
343Some useful information in the Apple Altivec / SSE Migration Guide:
344
http://developer.apple.com/documentation/Performance/Conceptual/Accelerate_sse_migration/index.html
347
348e.g. SSE select using and, andnot, or. Various SSE compare translations.
349
350//===---------------------------------------------------------------------===//
351
352Add hooks to commute some CMPP operations.
353
354//===---------------------------------------------------------------------===//
355
Apply the same transformation that merged four float loads into a single
128-bit load to loads from the constant pool.
358
359//===---------------------------------------------------------------------===//
360
361Floating point max / min are commutable when -enable-unsafe-fp-path is
362specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
363nodes which are selected to max / min instructions that are marked commutable.
364
365//===---------------------------------------------------------------------===//
366
367We should materialize vector constants like "all ones" and "signbit" with 
368code like:
369
370     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
371
372and:
     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     pslld   xmm1, 31     ; xmm1 = sign bit set in each element (0x80000000)

instead of using a load from the constant pool.  The latter is important for
ABS/NEG/copysign etc.
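
A sketch of the same idea with SSE2 intrinsics (helper names are illustrative):

#include <emmintrin.h>

static __m128i all_ones_si128(void) {
  __m128i x = _mm_setzero_si128();  // any value works; it is compared with itself
  return _mm_cmpeq_epi32(x, x);     // pcmpeqd reg,reg -> all-ones
}

static __m128 signbit_ps(void) {
  // all-ones shifted left by 31 gives 0x80000000 in each 32-bit element
  return _mm_castsi128_ps(_mm_slli_epi32(all_ones_si128(), 31));
}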
378
379//===---------------------------------------------------------------------===//
380
381These functions:
382
383#include <xmmintrin.h>
384__m128i a;
385void x(unsigned short n) {
386  a = _mm_slli_epi32 (a, n);
387}
388void y(unsigned n) {
389  a = _mm_slli_epi32 (a, n);
390}
391
392compile to ( -O3 -static -fomit-frame-pointer):
393_x:
394        movzwl  4(%esp), %eax
395        movd    %eax, %xmm0
396        movaps  _a, %xmm1
397        pslld   %xmm0, %xmm1
398        movaps  %xmm1, _a
399        ret
400_y:
401        movd    4(%esp), %xmm0
402        movaps  _a, %xmm1
403        pslld   %xmm0, %xmm1
404        movaps  %xmm1, _a
405        ret
406
407"y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
408like movd would be sufficient in both cases as the value is already zero 
409extended in the 32-bit stack slot IIRC.  For signed short, it should also be
410save, as a really-signed value would be undefined for pslld.
411
412
413//===---------------------------------------------------------------------===//
414
415#include <math.h>
416int t1(double d) { return signbit(d); }
417
418This currently compiles to:
419	subl	$12, %esp
420	movsd	16(%esp), %xmm0
421	movsd	%xmm0, (%esp)
422	movl	4(%esp), %eax
423	shrl	$31, %eax
424	addl	$12, %esp
425	ret
426
427We should use movmskp{s|d} instead.
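
A sketch of the desired form (hand-written; the name is made up): movmskpd
exposes the sign bit directly, avoiding the round trip through the stack.

#include <emmintrin.h>

int t1_movmsk(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;  // bit 0 = sign of lane 0
}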
428
429//===---------------------------------------------------------------------===//
430
431CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
432(aligned) vector load.  This functionality has a couple of problems.
433
4341. The code to infer alignment from loads of globals is in the X86 backend,
435   not the dag combiner.  This is because dagcombine2 needs to be able to see
436   through the X86ISD::Wrapper node, which DAGCombine can't really do.
4372. The code for turning 4 x load into a single vector load is target 
438   independent and should be moved to the dag combiner.
4393. The code for turning 4 x load into a vector load can only handle a direct 
440   load from a global or a direct load from the stack.  It should be generalized
441   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4424. The alignment inference code cannot handle loads from globals in non-static
443   mode because it doesn't look through the extra dyld stub load.  If you try
444   vec_align.ll without -relocation-model=static, you'll see what I mean.
445
446//===---------------------------------------------------------------------===//
447
448We should lower store(fneg(load p), q) into an integer load+xor+store, which
449eliminates a constant pool load.  For example, consider:
450
451define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
452entry:
453 %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
454 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
455 ret i64 %tmp20
456}
457declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
458
459This currently compiles to:
460
461LCPI1_0:					#  <4 x float>
462	.long	2147483648	# float -0
463	.long	2147483648	# float -0
464	.long	2147483648	# float -0
465	.long	2147483648	# float -0
466_ccosf:
467	subl	$12, %esp
468	movss	16(%esp), %xmm0
469	movss	%xmm0, 4(%esp)
470	movss	20(%esp), %xmm0
471	xorps	LCPI1_0, %xmm0
472	movss	%xmm0, (%esp)
473	call	L_ccoshf$stub
474	addl	$12, %esp
475	ret
476
477Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
478this code computes the pic base and does two loads to do the constant pool 
479load, so the improvement is much bigger.
480
481The tricky part about this xform is that the argument load/store isn't exposed
482until post-legalize, and at that point, the fneg has been custom expanded into 
483an X86 fxor.  This means that we need to handle this case in the x86 backend
484instead of in target independent code.
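
For reference, a conceptual sketch of the target form written by hand in C
(not backend output): negate a float on its way between two memory locations
with an integer load, xor of the sign bit, and integer store.

#include <stdint.h>
#include <string.h>

static void store_fneg(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);
  bits ^= 0x80000000u;            // flip the sign bit
  memcpy(q, &bits, sizeof bits);
}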
485
486//===---------------------------------------------------------------------===//
487
488Non-SSE4 insert into 16 x i8 is atrociously bad.
489
490//===---------------------------------------------------------------------===//
491
492<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
493is memory.
494
495//===---------------------------------------------------------------------===//
496
497SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
498sitting between the truncate and the extract.
499
500//===---------------------------------------------------------------------===//
501
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and can
insert any number of 0.0 elements at the same time.  Currently we only use it
for simple insertions.
505
506See comments in LowerINSERT_VECTOR_ELT_SSE4.
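
For illustration, the general form written by hand with the intrinsic: the
immediate encodes the source lane of b (bits 7:6), the destination lane in a
(bits 5:4), and a zero mask (bits 3:0); this copies b[2] into a[1] and zeroes
a[3] in a single instruction.

#include <smmintrin.h>

__m128 insertps_example(__m128 a, __m128 b) {
  return _mm_insert_ps(a, b, (2 << 6) | (1 << 4) | (1 << 3));
}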
507
508//===---------------------------------------------------------------------===//
509
510On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
511Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
512legal, it'll just take a few extra patterns written in the .td file.
513
514Note: this is not a code quality issue; the custom lowered code happens to be
515right, but we shouldn't have to custom lower anything.  This is probably related
516to <2 x i64> ops being so bad.
517
518//===---------------------------------------------------------------------===//
519
520'select' on vectors and scalars could be a whole lot better.  We currently 
521lower them to conditional branches.  On x86-64 for example, we compile this:
522
523double test(double a, double b, double c, double d) { return a<b ? c : d; }
524
525to:
526
527_test:
528	ucomisd	%xmm0, %xmm1
529	ja	LBB1_2	# entry
530LBB1_1:	# entry
531	movapd	%xmm3, %xmm2
532LBB1_2:	# entry
533	movapd	%xmm2, %xmm0
534	ret
535
536instead of:
537
538_test:
539	cmpltsd	%xmm1, %xmm0
540	andpd	%xmm0, %xmm2
541	andnpd	%xmm3, %xmm0
542	orpd	%xmm2, %xmm0
543	ret
544
For unpredictable branches, the latter is much more efficient.  This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.
548
549//===---------------------------------------------------------------------===//
550
LLVM currently generates stack realignment code when it is not needed.  The
problem is that we need to know about stack alignment too early, before RA
runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we
could produce unaligned loads/stores.

Fixing this will require some huge RA changes.
560
561Testcase:
562#include <emmintrin.h>
563
564typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
565
static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};
568
569vSInt16 madd(vSInt16 b)
570{
571    return _mm_madd_epi16(a, b);
572}
573
574Generated code (x86-32, linux):
575madd:
576        pushl   %ebp
577        movl    %esp, %ebp
578        andl    $-16, %esp
579        movaps  .LCPI1_0, %xmm1
580        pmaddwd %xmm1, %xmm0
581        movl    %ebp, %esp
582        popl    %ebp
583        ret
584
585//===---------------------------------------------------------------------===//
586
587Consider:
588#include <emmintrin.h> 
589__m128 foo2 (float x) {
590 return _mm_set_ps (0, 0, x, 0);
591}
592
593In x86-32 mode, we generate this spiffy code:
594
595_foo2:
596	movss	4(%esp), %xmm0
597	pshufd	$81, %xmm0, %xmm0
598	ret
599
600in x86-64 mode, we generate this code, which could be better:
601
602_foo2:
603	xorps	%xmm1, %xmm1
604	movss	%xmm0, %xmm1
605	pshufd	$81, %xmm1, %xmm0
606	ret
607
608In sse4 mode, we could use insertps to make both better.
609
610Here's another testcase that could use insertps [mem]:
611
612#include <xmmintrin.h>
613extern float x2, x3;
614__m128 foo1 (float x1, float x4) {
615 return _mm_set_ps (x2, x1, x3, x4);
616}
617
618gcc mainline compiles it to:
619
620foo1:
621       insertps        $0x10, x2(%rip), %xmm0
622       insertps        $0x10, x3(%rip), %xmm1
623       movaps  %xmm1, %xmm2
624       movlhps %xmm0, %xmm2
625       movaps  %xmm2, %xmm0
626       ret
627
628//===---------------------------------------------------------------------===//
629
630We compile vector multiply-by-constant into poor code:
631
632define <4 x i32> @f(<4 x i32> %i) nounwind  {
633	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
634	ret <4 x i32> %A
635}
636
637On targets without SSE4.1, this compiles into:
638
639LCPI1_0:					##  <4 x i32>
640	.long	10
641	.long	10
642	.long	10
643	.long	10
644	.text
645	.align	4,0x90
646	.globl	_f
647_f:
648	pshufd	$3, %xmm0, %xmm1
649	movd	%xmm1, %eax
650	imull	LCPI1_0+12, %eax
651	movd	%eax, %xmm1
652	pshufd	$1, %xmm0, %xmm2
653	movd	%xmm2, %eax
654	imull	LCPI1_0+4, %eax
655	movd	%eax, %xmm2
656	punpckldq	%xmm1, %xmm2
657	movd	%xmm0, %eax
658	imull	LCPI1_0, %eax
659	movd	%eax, %xmm1
660	movhlps	%xmm0, %xmm0
661	movd	%xmm0, %eax
662	imull	LCPI1_0+8, %eax
663	movd	%eax, %xmm0
664	punpckldq	%xmm0, %xmm1
665	movaps	%xmm1, %xmm0
666	punpckldq	%xmm2, %xmm0
667	ret
668
669It would be better to synthesize integer vector multiplication by constants
670using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
671simple cases such as multiplication by powers of two would be better as
672vector shifts than as multiplications.
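
For example, multiplication by 10 can be done per-lane with two shifts and an
add (a hand-written sketch, SSE2 only):

#include <emmintrin.h>

__m128i mul10_epi32(__m128i x) {
  // x*10 == (x<<3) + (x<<1)
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}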
673
674//===---------------------------------------------------------------------===//
675
676We compile this:
677
678__m128i
679foo2 (char x)
680{
681  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
682}
683
684into:
685	movl	$1, %eax
686	xorps	%xmm0, %xmm0
687	pinsrw	$2, %eax, %xmm0
688	movzbl	4(%esp), %eax
689	pinsrw	$3, %eax, %xmm0
690	movl	$256, %eax
691	pinsrw	$7, %eax, %xmm0
692	ret
693
694
695gcc-4.2:
696	subl	$12, %esp
697	movzbl	16(%esp), %eax
698	movdqa	LC0, %xmm0
699	pinsrw	$3, %eax, %xmm0
700	addl	$12, %esp
701	ret
702	.const
703	.align 4
704LC0:
705	.word	0
706	.word	0
707	.word	1
708	.word	0
709	.word	0
710	.word	0
711	.word	0
712	.word	256
713
With SSE4, it should be:
715      movdqa  .LC0(%rip), %xmm0
716      pinsrb  $6, %edi, %xmm0
717
718//===---------------------------------------------------------------------===//
719
We should transform a shuffle of two vectors of constants into a single vector
of constants.  Also, insertelement of a constant into a vector of constants
should result in a vector of constants; e.g. 2008-06-25-VecISelBug.ll.
723
724We compiled it to something horrible:
725
726	.align	4
727LCPI1_1:					##  float
728	.long	1065353216	## float 1
729	.const
730
731	.align	4
732LCPI1_0:					##  <4 x float>
733	.space	4
734	.long	1065353216	## float 1
735	.space	4
736	.long	1065353216	## float 1
737	.text
738	.align	4,0x90
739	.globl	_t
740_t:
741	xorps	%xmm0, %xmm0
742	movhps	LCPI1_0, %xmm0
743	movss	LCPI1_1, %xmm1
744	movaps	%xmm0, %xmm2
745	shufps	$2, %xmm1, %xmm2
746	shufps	$132, %xmm2, %xmm0
747	movaps	%xmm0, 0
748
749//===---------------------------------------------------------------------===//
750rdar://5907648
751
752This function:
753
754float foo(unsigned char x) {
755  return x;
756}
757
corresponds to this IR (x86-32):
759
760define float @foo(i8 zeroext  %x) nounwind  {
761	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
762	ret float %tmp12
763}
764
765compiles to:
766
767_foo:
768	subl	$4, %esp
769	movzbl	8(%esp), %eax
770	cvtsi2ss	%eax, %xmm0
771	movss	%xmm0, (%esp)
772	flds	(%esp)
773	addl	$4, %esp
774	ret
775
776We should be able to use:
  cvtsi2ss 8(%esp), %xmm0
778since we know the stack slot is already zext'd.
779
780//===---------------------------------------------------------------------===//
781
782Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
783when code size is critical. movlps is slower than movsd on core2 but it's one
784byte shorter.
785
786//===---------------------------------------------------------------------===//
787
788We should use a dynamic programming based approach to tell when using FPStack
789operations is cheaper than SSE.  SciMark montecarlo contains code like this
790for example:
791
792double MonteCarlo_num_flops(int Num_samples) {
793    return ((double) Num_samples)* 4.0;
794}
795
796In fpstack mode, this compiles into:
797
798LCPI1_0:					
799	.long	1082130432	## float 4.000000e+00
800_MonteCarlo_num_flops:
801	subl	$4, %esp
802	movl	8(%esp), %eax
803	movl	%eax, (%esp)
804	fildl	(%esp)
805	fmuls	LCPI1_0
806	addl	$4, %esp
807	ret
808        
809in SSE mode, it compiles into significantly slower code:
810
811_MonteCarlo_num_flops:
812	subl	$12, %esp
813	cvtsi2sd	16(%esp), %xmm0
814	mulsd	LCPI1_0, %xmm0
815	movsd	%xmm0, (%esp)
816	fldl	(%esp)
817	addl	$12, %esp
818	ret
819
There are also other cases in SciMark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.
823
824//===---------------------------------------------------------------------===//
825
826The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
827"cmpsd".  For example, this code:
828
829double d1(double x) { return x == x ? x : x + x; }
830
831Compiles into:
832
833_d1:
834	ucomisd	%xmm0, %xmm0
835	jnp	LBB1_2
836	addsd	%xmm0, %xmm0
837	ret
838LBB1_2:
839	ret
840
841Also, the 'ret's should be shared.  This is PR6032.
842
843//===---------------------------------------------------------------------===//
844
These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?
847
848define float @foo(float %x) nounwind {
849  %t = bitcast float %x to i32
850  %s = and i32 %t, 2147483647
851  %d = bitcast i32 %s to float
852  ret float %d
853}
854
855declare float @fabsf(float %n)
856define float @bar(float %x) nounwind {
857  %d = call float @fabsf(float %x)
858  ret float %d
859}
860
861//===---------------------------------------------------------------------===//
862
863This IR (from PR6194):
864
865target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
866target triple = "x86_64-apple-darwin10.0.0"
867
868%0 = type { double, double }
869%struct.float3 = type { float, float, float }
870
871define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
872entry:
873  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
874  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
875  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
876  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
877  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
878  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
879  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
880  store float %tmp12, float* %tmp5
881  ret void
882}
883
884Compiles to:
885
886_test:                                  ## @test
887	movd	%xmm0, %rax
888	shrq	$32, %rax
889	movl	%eax, 4(%rdi)
890	ret
891
This would be better kept in the SSE unit by treating XMM0 as a 4 x float,
doing a shuffle from v[1] to v[0], and then a float store.
894
895//===---------------------------------------------------------------------===//
896
897On SSE4 machines, we compile this code:
898
899define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
900       <2 x float> *%P) nounwind {
901  %Z = fadd <2 x float> %Q, %R
902
903  store <2 x float> %Z, <2 x float> *%P
904  ret <2 x float> %Z
905}
906
907into:
908
909_test2:                                 ## @test2
910## BB#0:
911	insertps	$0, %xmm2, %xmm2
912	insertps	$16, %xmm3, %xmm2
913	insertps	$0, %xmm0, %xmm3
914	insertps	$16, %xmm1, %xmm3
915	addps	%xmm2, %xmm3
916	movq	%xmm3, (%rdi)
917	movaps	%xmm3, %xmm0
918	pshufd	$1, %xmm3, %xmm1
919                                        ## kill: XMM1<def> XMM1<kill>
920	ret
921
922The insertps's of $0 are pointless complex copies.
923
924//===---------------------------------------------------------------------===//
925
926[UNSAFE FP]
927
928void foo(double, double, double);
929void norm(double x, double y, double z) {
930  double scale = __builtin_sqrt(x*x + y*y + z*z);
931  foo(x/scale, y/scale, z/scale);
932}
933
We currently generate an sqrtsd and 3 divsd instructions.  This is bad; fp div
is slow and not pipelined.  In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs.  This can be done as a
target-independent transform.
938
939If we're dealing with floats instead of doubles we could even replace the sqrtss
940and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
941cost of reduced accuracy.
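
A sketch of the fast-math form, written by hand against the same hypothetical
foo as above:

void foo(double, double, double);

void norm_fast(double x, double y, double z) {
  double inv = 1.0 / __builtin_sqrt(x*x + y*y + z*z);  // one divide...
  foo(x*inv, y*inv, z*inv);                            // ...three multiplies
}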
942
943//===---------------------------------------------------------------------===//
944
945This function should be matched to haddpd when the appropriate CPU is enabled:
946
947#include <x86intrin.h>
948double f (__m128d p) {
949  return p[0] + p[1];
950}
951
Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd as well.
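
Written by hand with the SSE3 intrinsic, the match for f should amount to:

#include <x86intrin.h>

double f_hadd(__m128d p) {
  return _mm_cvtsd_f64(_mm_hadd_pd(p, p));  // p[0]+p[1] in lane 0
}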
954
955//===---------------------------------------------------------------------===//
956