//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//

Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
registers, to generate better spill code.

//===----------------------------------------------------------------------===//

The first function below should compile to a single lvx from the constant pool;
the second should compile to an xor/stvx:

void bar(int *);
void foo(void) {
  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
  bar (x);
}

#include <string.h>
void bar(int *);
void foo(void) {
  int x[8] __attribute__((aligned(128)));
  memset (x, 0, sizeof (x));
  bar (x);
}

//===----------------------------------------------------------------------===//

Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763

When -ffast-math is on, we can use 0.0.
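
A hedged numeric illustration of why the addend must be -0.0: when a*b is
-0.0, the sum (-0.0) + 0.0 is +0.0 and the sign is lost, while
(-0.0) + (-0.0) stays -0.0.  The function below is just an example of a plain
vector multiply that gets lowered to vmaddfp plus a constant addend:

#include <altivec.h>

vector float vmul(vector float a, vector float b) {
  /* Lowered to vmaddfp with a splatted addend; that addend must be -0.0
     (or 0.0 under -ffast-math, as noted above). */
  return a * b;
}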

//===----------------------------------------------------------------------===//

  Consider this:
  v4f32 Vector;
  v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };

Since we know that "Vector" is 16-byte aligned and we know the element offset
of ".X", we should change the load into a lve*x instruction, instead of doing
a load/store/lve*x sequence.
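
The same idea written with real AltiVec intrinsics (a sketch; the function
name is made up):

#include <altivec.h>

vector float splat_x(const vector float *p) {
  /* Load the aligned vector and splat element 0 ("X").  Ideally the full
     vector load becomes a single lve*x of just that element. */
  return vec_splat(vec_ld(0, p), 0);
}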

//===----------------------------------------------------------------------===//

Implement passing vectors by value into calls and receiving them as arguments.
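
A minimal case that exercises both directions (hedged example; the names are
made up):

#include <altivec.h>

vector float vadd(vector float a, vector float b) {
  return vec_add(a, b);          /* receives both vectors by value */
}

vector float caller(vector float x) {
  return vadd(x, x);             /* passes vectors by value into the call */
}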

//===----------------------------------------------------------------------===//

GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
of C1/C2/C3, then a load and vperm of Variable.

//===----------------------------------------------------------------------===//

We need a way to teach tblgen that some operands of an intrinsic are required to
be constants.  The verifier should enforce this constraint.

//===----------------------------------------------------------------------===//

We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
aligned stack slot, followed by a load/vperm.  We should probably just store it
to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
in memory this is a big win.
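
A hedged example of the "value already in memory" case (the function name is
made up):

#include <altivec.h>

vector float from_mem(const float *p) {
  /* The scalar is already in memory, so an lvsl/vperm (or lvewx) load of it
     would avoid the round trip through a 16-byte aligned stack slot. */
  return (vector float){ *p, 0.0f, 0.0f, 0.0f };
}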

//===----------------------------------------------------------------------===//

extract_vector_elt of an arbitrary constant vector can be done with the
following instructions:

vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
vec_ste(vTemp,0,&destloc);

We can do an arbitrary non-constant value by using lvsr/perm/ste.
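
Wrapped into a complete function (a sketch; the name is made up), the
constant-index case above becomes:

#include <altivec.h>

int extract_elt2(vector int v0) {
  int destloc;
  vector int vTemp = vec_splat(v0, 2);  /* 2 is the element the src is in */
  vec_ste(vTemp, 0, &destloc);          /* single-element store (stvewx)  */
  return destloc;
}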

//===----------------------------------------------------------------------===//

If we want to tie instruction selection into the scheduler, we can do some
constant formation with different instructions.  For example, we can generate
"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
"vsplti 0" or "vxor", each of which uses a different execution unit and thus
could help scheduling.

This is probably only reasonable for a post-pass scheduler.

//===----------------------------------------------------------------------===//

For this function:

void test(vector float *A, vector float *B) {
  vector float C = (vector float)vec_cmpeq(*A, *B);
  if (!vec_any_eq(*A, *B))
    *B = (vector float){0,0,0,0};
  *A = C;
}

we get the following basic block:

        ...
        lvx v2, 0, r4
        lvx v3, 0, r3
        vcmpeqfp v4, v3, v2
        vcmpeqfp. v2, v3, v2
        bne cr6, LBB1_2 ; cond_next

The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
vcmpeqfp. result is used by a branch.  This can be improved.

//===----------------------------------------------------------------------===//

The code generated for this is truly awful:

vector float test(float a, float b) {
 return (vector float){ 0.0, a, 0.0, 0.0};
}

LCPI1_0:                                        ;  float
        .space  4
        .text
        .globl  _test
        .align  4
_test:
        mfspr r2, 256
        oris r3, r2, 4096
        mtspr 256, r3
        lis r3, ha16(LCPI1_0)
        addi r4, r1, -32
        stfs f1, -16(r1)
        addi r5, r1, -16
        lfs f0, lo16(LCPI1_0)(r3)
        stfs f0, -32(r1)
        lvx v2, 0, r4
        lvx v3, 0, r5
        vmrghw v3, v3, v2
        vspltw v2, v2, 0
        vmrghw v2, v2, v3
        mtspr 256, r2
        blr

//===----------------------------------------------------------------------===//

int foo(vector float *x, vector float *y) {
        if (vec_all_eq(*x,*y)) return 3245;
        else return 12;
}

A predicate compare being used in a select_cc should have the same peephole
applied to it as a predicate compare used by a br_cc.  There should be no
mfcr here:

_foo:
        mfspr r2, 256
        oris r5, r2, 12288
        mtspr 256, r5
        li r5, 12
        li r6, 3245
        lvx v2, 0, r4
        lvx v3, 0, r3
        vcmpeqfp. v2, v3, v2
        mfcr r3, 2
        rlwinm r3, r3, 25, 31, 31
        cmpwi cr0, r3, 0
        bne cr0, LBB1_2 ; entry
LBB1_1: ; entry
        mr r6, r5
LBB1_2: ; entry
        mr r3, r6
        mtspr 256, r2
        blr

//===----------------------------------------------------------------------===//

CodeGen/PowerPC/vec_constants.ll has an and operation that should be
codegen'd to andc.  The issue is that the 'all ones' build vector is
SelectNodeTo'd to a VSPLTISB instruction node before the and/xor is selected,
which prevents the vnot pattern from matching.


//===----------------------------------------------------------------------===//

An alternative to the store/store/load approach for illegal insert element
lowering would be (a C sketch of the select-mask steps follows the list):

1. store element to any ol' slot
2. lvx the slot
3. lvsl 0; splat index; vcmpeq to generate a select mask
4. lvsl slot + x; vperm to rotate result into correct slot
5. vsel result together.
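
A sketch of the select-mask idea (steps 3-5) at the C level, assuming the
index is an element number and using vec_splats for the index and the value
rather than the lvsl trick; the names are made up:

#include <altivec.h>

vector float insert_elt(vector float v, float x, unsigned int i) {
  vector unsigned int lanes = { 0, 1, 2, 3 };
  vector unsigned int idx   = vec_splats(i);          /* splat index          */
  vector bool int     mask  = vec_cmpeq(lanes, idx);  /* select mask          */
  vector float        xv    = vec_splats(x);          /* value in every lane  */
  return vec_sel(v, xv, mask);                        /* vsel result together */
}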

//===----------------------------------------------------------------------===//

Should codegen branches on vec_any/vec_all to avoid mfcr.  Two examples:

#include <altivec.h>
 int f(vector float a, vector float b)
 {
  int aa = 0;
  if (vec_all_ge(a, b))
    aa |= 0x1;
  if (vec_any_ge(a,b))
    aa |= 0x2;
  return aa;
}

vector float f(vector float a, vector float b) {
  if (vec_any_eq(a, b))
    return a;
  else
    return b;
}

//===----------------------------------------------------------------------===//

We should do a little better with eliminating dead stores.
The stores to the stack are dead since %a and %b are not needed:

; Function Attrs: nounwind
define <16 x i8> @test_vpmsumb() #0 {
  entry:
  %a = alloca <16 x i8>, align 16
  %b = alloca <16 x i8>, align 16
  store <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, <16 x i8>* %a, align 16
  store <16 x i8> <i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 112>, <16 x i8>* %b, align 16
  %0 = load <16 x i8>, <16 x i8>* %a, align 16
  %1 = load <16 x i8>, <16 x i8>* %b, align 16
  %2 = call <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8> %0, <16 x i8> %1)
  ret <16 x i8> %2
}


; Function Attrs: nounwind readnone
declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1


Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
# %bb.0:                                # %entry
    addis 3, 2, .LCPI0_0@toc@ha
    addis 4, 2, .LCPI0_1@toc@ha
    addi 3, 3, .LCPI0_0@toc@l
    addi 4, 4, .LCPI0_1@toc@l
    lxvw4x 0, 0, 3
    addi 3, 1, -16
    lxvw4x 35, 0, 4
    stxvw4x 0, 0, 3
    ori 2, 2, 0
    lxvw4x 34, 0, 3
    addi 3, 1, -32
    stxvw4x 35, 0, 3
    vpmsumb 2, 2, 3
    blr
    .long   0
    .quad   0

The two stxvw4x instructions are not needed.
With -mtriple=powerpc64le-unknown-linux-gnu, the associated permutes
are present too.

//===----------------------------------------------------------------------===//

The following example is found in test/CodeGen/PowerPC/vec_add_sub_doubleword.ll:

define <2 x i64> @increment_by_val(<2 x i64> %x, i64 %val) nounwind {
       %tmpvec = insertelement <2 x i64> <i64 0, i64 0>, i64 %val, i32 0
       %tmpvec2 = insertelement <2 x i64> %tmpvec, i64 %val, i32 1
       %result = add <2 x i64> %x, %tmpvec2
       ret <2 x i64> %result
}

This will generate the following instruction sequence:
        std 5, -8(1)
        std 5, -16(1)
        addi 3, 1, -16
        ori 2, 2, 0
        lxvd2x 35, 0, 3
        vaddudm 2, 2, 3
        blr

This will almost certainly cause a load-hit-store hazard.
Since val is a value parameter, it should not need to be saved onto
the stack, unless doing so is needed to set up the vector register. Instead,
it would be better to splat the value into a vector register, and then
remove the (dead) stores to the stack.
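
At the source level the better sequence corresponds to something like the
following (a sketch, assuming a POWER8 target where vec_splats of an i64 can
be done with a direct move rather than stack stores; the C function is only
an illustration of the IR above):

#include <altivec.h>

vector signed long long increment_by_val(vector signed long long x,
                                         long long val) {
  return vec_add(x, vec_splats(val));   /* splat val, then vaddudm */
}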

//===----------------------------------------------------------------------===//

At the moment we always generate a lxsdx in preference to lfd, or stxsdx in
preference to stfd.  When we have a reg-immediate addressing mode, this is a
poor choice, since we have to load the address into an index register.  This
should be fixed for P7/P8.
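
A hedged example of the reg+immediate case (the struct and names are made
up): with lfd the access below is a single load using the immediate offset,
while lxsdx first has to materialize that offset in an index register.

struct S { long tag; double d; };       /* d lives at a small fixed offset */

double load_d(const struct S *s) {
  return s->d;
}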

//===----------------------------------------------------------------------===//

Right now, ShuffleKind 0 is supported only on BE, and ShuffleKind 2 only on LE.
However, we could actually support both kinds on either endianness, if we check
for the appropriate shufflevector pattern for each case ...  this would cause
some additional shufflevectors to be recognized and implemented via the
"swapped" form.

//===----------------------------------------------------------------------===//

There is a utility program called PerfectShuffle that generates a table of the
shortest instruction sequence for implementing a shufflevector operation on
PowerPC.  However, this was designed for big-endian code generation.  We could
modify this program to create a little-endian version of the table.  The table
is used in PPCISelLowering.cpp, PPCTargetLowering::LowerVECTOR_SHUFFLE().

//===----------------------------------------------------------------------===//

Opportunities to use instructions from PPCInstrVSX.td during code gen
  - Conversion instructions (Sections 7.6.1.5 and 7.6.1.6 of ISA 2.07)
  - Scalar comparisons (xscmpodp and xscmpudp)
  - Min and max (xsmaxdp, xsmindp, xvmaxdp, xvmindp, xvmaxsp, xvminsp)

Related to this: we currently do not generate the lxvw4x instruction for either
v4f32 or v4i32, probably because adding a dag pattern to the recognizer requires
a single target type.  This should probably be addressed in the PPCISelDAGToDAG logic.

//===----------------------------------------------------------------------===//

Currently EXTRACT_VECTOR_ELT and INSERT_VECTOR_ELT are type-legal only
for v2f64 with VSX available.  We should create custom lowering
support for the other vector types.  Without this support, we generate
sequences with load-hit-store hazards.
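
For reference, the kind of source that runs into this today (hedged example;
the names are made up, and vec_extract/vec_insert are the usual intrinsics
for variable-index element access):

#include <altivec.h>

float get_lane(vector float v, int i) {
  return vec_extract(v, i);        /* EXTRACT_VECTOR_ELT, variable index */
}

vector float set_lane(vector float v, float x, int i) {
  return vec_insert(x, v, i);      /* INSERT_VECTOR_ELT, variable index  */
}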

v4f32 can be supported with VSX by shifting the correct element into
big-endian lane 0, using xscvspdpn to produce a double-precision
representation of the single-precision value in big-endian
double-precision lane 0, and reinterpreting lane 0 as an FPR or
vector-scalar register.

v2i64 can be supported with VSX and P8Vector in the same manner as
v2f64, followed by a direct move to a GPR.

v4i32 can be supported with VSX and P8Vector by shifting the correct
element into big-endian lane 1, using a direct move to a GPR, and
sign-extending the 32-bit result to 64 bits.

v8i16 can be supported with VSX and P8Vector by shifting the correct
element into big-endian lane 3, using a direct move to a GPR, and
sign-extending the 16-bit result to 64 bits.

v16i8 can be supported with VSX and P8Vector by shifting the correct
element into big-endian lane 7, using a direct move to a GPR, and
sign-extending the 8-bit result to 64 bits.
