1 //===-- HexagonVectorCombine.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // HexagonVectorCombine is a utility class implementing a variety of functions
9 // that assist in vector-based optimizations.
10 //
11 // AlignVectors: replace unaligned vector loads and stores with aligned ones.
12 // HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/APInt.h"
16 #include "llvm/ADT/ArrayRef.h"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/Analysis/AliasAnalysis.h"
21 #include "llvm/Analysis/AssumptionCache.h"
22 #include "llvm/Analysis/InstSimplifyFolder.h"
23 #include "llvm/Analysis/InstructionSimplify.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/Analysis/ValueTracking.h"
26 #include "llvm/Analysis/VectorUtils.h"
27 #include "llvm/CodeGen/TargetPassConfig.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/IntrinsicInst.h"
32 #include "llvm/IR/Intrinsics.h"
33 #include "llvm/IR/IntrinsicsHexagon.h"
34 #include "llvm/IR/Metadata.h"
35 #include "llvm/IR/PatternMatch.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/CommandLine.h"
39 #include "llvm/Support/KnownBits.h"
40 #include "llvm/Support/MathExtras.h"
41 #include "llvm/Support/raw_ostream.h"
42 #include "llvm/Target/TargetMachine.h"
43 #include "llvm/Transforms/Utils/Local.h"
44 
45 #include "HexagonSubtarget.h"
46 #include "HexagonTargetMachine.h"
47 
48 #include <algorithm>
49 #include <deque>
50 #include <map>
51 #include <optional>
52 #include <set>
53 #include <utility>
54 #include <vector>
55 
56 #define DEBUG_TYPE "hexagon-vc"
57 
58 using namespace llvm;
59 
60 namespace {
61 cl::opt<bool> AlignFullHvxStores("hvc-align-full-hvx-stores", cl::Hidden);
62 
63 class HexagonVectorCombine {
64 public:
65   HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
66                        DominatorTree &DT_, TargetLibraryInfo &TLI_,
67                        const TargetMachine &TM_)
68       : F(F_), DL(F.getParent()->getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
69         TLI(TLI_),
70         HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
71 
72   bool run();
73 
74   // Common integer type.
75   IntegerType *getIntTy(unsigned Width = 32) const;
76   // Byte type: either scalar (when ElemCount = 0), or vector with given
77   // element count.
78   Type *getByteTy(int ElemCount = 0) const;
79   // Boolean type: either scalar (when ElemCount = 0), or vector with given
80   // element count.
81   Type *getBoolTy(int ElemCount = 0) const;
82   // Create a ConstantInt of type returned by getIntTy with the value Val.
83   ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
84   // Get the integer value of V, if it exists.
85   std::optional<APInt> getIntValue(const Value *Val) const;
86   // Is Val a constant 0, or a vector of 0s?
87   bool isZero(const Value *Val) const;
88   // Is Val an undef value?
89   bool isUndef(const Value *Val) const;
90   // Is Val a scalar (i1 true) or a vector of (i1 true)?
91   bool isTrue(const Value *Val) const;
92   // Is Val a scalar (i1 false) or a vector of (i1 false)?
93   bool isFalse(const Value *Val) const;
94 
95   // Get HVX vector type with the given element type.
96   VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
97 
98   enum SizeKind {
99     Store, // Store size
100     Alloc, // Alloc size
101   };
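  // Note: a type's store size is the number of bytes a store of it writes,
  // while its alloc size is the store size padded up to the type's ABI
  // alignment. For example (illustrative; exact values depend on the data
  // layout), an i36 has a 5-byte store size but is typically allocated
  // 8 bytes; for the HVX vector types handled here the two coincide.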
102   int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
103   int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
104   int getTypeAlignment(Type *Ty) const;
105   size_t length(Value *Val) const;
106   size_t length(Type *Ty) const;
107 
108   Constant *getNullValue(Type *Ty) const;
109   Constant *getFullValue(Type *Ty) const;
110   Constant *getConstSplat(Type *Ty, int Val) const;
111 
112   Value *simplify(Value *Val) const;
113 
114   Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
115                  int Length, int Where) const;
116   Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
117                   Value *Amt) const;
118   Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
119                   Value *Amt) const;
120   Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
121   Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
122                  Value *Pad) const;
123   Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
124                  Type *ToTy) const;
125   Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
126   Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
127   Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
128                    unsigned Length) const;
129   Value *sublo(IRBuilderBase &Builder, Value *Val) const;
130   Value *subhi(IRBuilderBase &Builder, Value *Val) const;
131   Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
132   Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
133 
134   Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
135                             Type *RetTy, ArrayRef<Value *> Args,
136                             ArrayRef<Type *> ArgTys = std::nullopt,
137                             ArrayRef<Value *> MDSources = std::nullopt) const;
138   SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
139                                            unsigned ToWidth) const;
140   Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
141                             VectorType *ToType) const;
142 
143   std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
144 
145   unsigned getNumSignificantBits(const Value *V,
146                                  const Instruction *CtxI = nullptr) const;
147   KnownBits getKnownBits(const Value *V,
148                          const Instruction *CtxI = nullptr) const;
149 
150   template <typename T = std::vector<Instruction *>>
151   bool isSafeToMoveBeforeInBB(const Instruction &In,
152                               BasicBlock::const_iterator To,
153                               const T &IgnoreInsts = {}) const;
154 
155   // This function is only used for assertions at the moment.
156   [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
157 
158   Function &F;
159   const DataLayout &DL;
160   AliasAnalysis &AA;
161   AssumptionCache &AC;
162   DominatorTree &DT;
163   TargetLibraryInfo &TLI;
164   const HexagonSubtarget &HST;
165 
166 private:
167   Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
168                          int Start, int Length) const;
169 };
170 
171 class AlignVectors {
172   // This code tries to replace unaligned vector loads/stores with aligned
173   // ones.
174   // Consider unaligned load:
175   //   %v = original_load %some_addr, align <bad>
176   //   %user = %v
177   // It will generate
178   //      = load ..., align <good>
179   //      = load ..., align <good>
180   //      = valign
181   //      etc.
182   //   %synthesize = combine/shuffle the loaded data so that it looks
183   //                 exactly like what "original_load" has loaded.
184   //   %user = %synthesize
185   // Similarly for stores.
186 public:
187   AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
188 
189   bool run();
190 
191 private:
192   using InstList = std::vector<Instruction *>;
193 
194   struct AddrInfo {
195     AddrInfo(const AddrInfo &) = default;
196     AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
197              Align H)
198         : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
199           NeedAlign(HVC.getTypeAlignment(ValTy)) {}
200     AddrInfo &operator=(const AddrInfo &) = default;
201 
202     // XXX: add Size member?
203     Instruction *Inst;
204     Value *Addr;
205     Type *ValTy;
206     Align HaveAlign;
207     Align NeedAlign;
208     int Offset = 0; // Offset (in bytes) from the first member of the
209                     // containing AddrList.
210   };
211   using AddrList = std::vector<AddrInfo>;
212 
213   struct InstrLess {
214     bool operator()(const Instruction *A, const Instruction *B) const {
215       return A->comesBefore(B);
216     }
217   };
218   using DepList = std::set<Instruction *, InstrLess>;
219 
220   struct MoveGroup {
221     MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
222         : Base(B), Main{AI.Inst}, IsHvx(Hvx), IsLoad(Load) {}
223     Instruction *Base; // Base instruction of the parent address group.
224     InstList Main;     // Main group of instructions.
225     InstList Deps;     // List of dependencies.
226   bool IsHvx;        // Is this a group of HVX instructions?
227     bool IsLoad;       // Is this a load group?
228   };
229   using MoveList = std::vector<MoveGroup>;
230 
231   struct ByteSpan {
232     // A representation of "interesting" bytes within a given span of memory.
233     // These bytes are those that are loaded or stored, and they don't have
234     // to cover the entire span of memory.
235     //
236     // The representation works by picking a contiguous sequence of bytes
237     // from somewhere within a llvm::Value, and placing it at a given offset
238     // within the span.
239     //
240     // The sequence of bytes from llvm::Value is represented by Segment.
241     // Block is Segment, plus where it goes in the span.
242     //
243     // An important feature of ByteSpan is being able to make a "section",
244     // i.e. creating another ByteSpan corresponding to a range of offsets
245     // relative to the source span.
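    //
    // For example (illustrative values), if a span contains the block
    //   Block(V, /*Off=*/0, /*Len=*/8, /*Pos=*/4)
    // then section(6, 4) yields a span with the single block
    //   Block(V, /*Off=*/2, /*Len=*/4, /*Pos=*/6),
    // i.e. the overlapping bytes keep their absolute positions in the span.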
246 
247     struct Segment {
248       // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
249       Segment(Value *Val, int Begin, int Len)
250           : Val(Val), Start(Begin), Size(Len) {}
251       Segment(const Segment &Seg) = default;
252       Segment &operator=(const Segment &Seg) = default;
253       Value *Val; // Value representable as a sequence of bytes.
254       int Start;  // First byte of the value that belongs to the segment.
255       int Size;   // Number of bytes in the segment.
256     };
257 
258     struct Block {
259       Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
260       Block(Value *Val, int Off, int Len, int Pos)
261           : Seg(Val, Off, Len), Pos(Pos) {}
262       Block(const Block &Blk) = default;
263       Block &operator=(const Block &Blk) = default;
264       Segment Seg; // Value segment.
265       int Pos;     // Position (offset) of the block in the span.
266     };
267 
268     int extent() const;
269     ByteSpan section(int Start, int Length) const;
270     ByteSpan &shift(int Offset);
271     SmallVector<Value *, 8> values() const;
272 
273     int size() const { return Blocks.size(); }
274     Block &operator[](int i) { return Blocks[i]; }
275     const Block &operator[](int i) const { return Blocks[i]; }
276 
277     std::vector<Block> Blocks;
278 
279     using iterator = decltype(Blocks)::iterator;
280     iterator begin() { return Blocks.begin(); }
281     iterator end() { return Blocks.end(); }
282     using const_iterator = decltype(Blocks)::const_iterator;
283     const_iterator begin() const { return Blocks.begin(); }
284     const_iterator end() const { return Blocks.end(); }
285   };
286 
287   Align getAlignFromValue(const Value *V) const;
288   std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
289   bool isHvx(const AddrInfo &AI) const;
290   // This function is only used for assertions at the moment.
291   [[maybe_unused]] bool isSectorTy(Type *Ty) const;
292 
293   Value *getPayload(Value *Val) const;
294   Value *getMask(Value *Val) const;
295   Value *getPassThrough(Value *Val) const;
296 
297   Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
298                                int Adjust) const;
299   Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
300                               int Alignment) const;
301 
302   Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
303                     Value *Predicate, int Alignment, Value *Mask,
304                     Value *PassThru,
305                     ArrayRef<Value *> MDSources = std::nullopt) const;
306   Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
307                           int Alignment,
308                           ArrayRef<Value *> MDSources = std::nullopt) const;
309 
310   Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
311                      Value *Predicate, int Alignment, Value *Mask,
312                      ArrayRef<Value *> MDSources = std::nullopt) const;
313   Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
314                            int Alignment,
315                            ArrayRef<Value *> MDSources = std::nullopt) const;
316 
317   Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
318                               Value *Predicate, int Alignment,
319                               ArrayRef<Value *> MDSources = std::nullopt) const;
320   Value *
321   createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
322                         Value *Predicate, int Alignment,
323                         ArrayRef<Value *> MDSources = std::nullopt) const;
324 
325   DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
326   bool createAddressGroups();
327   MoveList createLoadGroups(const AddrList &Group) const;
328   MoveList createStoreGroups(const AddrList &Group) const;
329   bool move(const MoveGroup &Move) const;
330   void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
331                         int ScLen, Value *AlignVal, Value *AlignAddr) const;
332   void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
333                          int ScLen, Value *AlignVal, Value *AlignAddr) const;
334   bool realignGroup(const MoveGroup &Move) const;
335 
336   Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
337                              int Alignment) const;
338 
339   friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
340   friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
341   friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
342   friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
343 
344   std::map<Instruction *, AddrList> AddrGroups;
345   const HexagonVectorCombine &HVC;
346 };
347 
348 LLVM_ATTRIBUTE_UNUSED
349 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
350   OS << "Inst: " << AI.Inst << "  " << *AI.Inst << '\n';
351   OS << "Addr: " << *AI.Addr << '\n';
352   OS << "Type: " << *AI.ValTy << '\n';
353   OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
354   OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
355   OS << "Offset: " << AI.Offset;
356   return OS;
357 }
358 
359 LLVM_ATTRIBUTE_UNUSED
360 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
361   OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
362   OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
363   OS << "Main\n";
364   for (Instruction *I : MG.Main)
365     OS << "  " << *I << '\n';
366   OS << "Deps\n";
367   for (Instruction *I : MG.Deps)
368     OS << "  " << *I << '\n';
369   return OS;
370 }
371 
372 LLVM_ATTRIBUTE_UNUSED
373 raw_ostream &operator<<(raw_ostream &OS,
374                         const AlignVectors::ByteSpan::Block &B) {
375   OS << "  @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
376   if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
377     OS << "(self:" << B.Seg.Val << ')';
378   } else if (B.Seg.Val != nullptr) {
379     OS << *B.Seg.Val;
380   } else {
381     OS << "(null)";
382   }
383   return OS;
384 }
385 
386 LLVM_ATTRIBUTE_UNUSED
387 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
388   OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
389   for (const AlignVectors::ByteSpan::Block &B : BS)
390     OS << B << '\n';
391   OS << ']';
392   return OS;
393 }
394 
395 class HvxIdioms {
396 public:
397   HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
398     auto *Int32Ty = HVC.getIntTy(32);
399     HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
400     HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
401   }
402 
403   bool run();
404 
405 private:
406   enum Signedness { Positive, Signed, Unsigned };
407 
408   // Value + sign
409   // This is to keep track of whether the value should be treated as signed
410   // or unsigned, or is known to be positive.
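  // For example, a zero-extended value can be treated as Unsigned, while a
  // value whose sign bit is known to be clear can be treated as Positive,
  // i.e. usable as either signed or unsigned.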
411   struct SValue {
412     Value *Val;
413     Signedness Sgn;
414   };
415 
416   struct FxpOp {
417     unsigned Opcode;
418     unsigned Frac; // Number of fraction bits
419     SValue X, Y;
420     // If present, add 1 << RoundAt before shift:
421     std::optional<unsigned> RoundAt;
422     VectorType *ResTy;
423   };
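  // For example (a sketch of one matched shape, shown with scalar i32 for
  // brevity; the pass operates on the vector form), a rounding Q15 multiply
  //   %m = mul i32 %x, %y
  //   %r = add i32 %m, 16384    ; 1 << 14
  //   %s = ashr i32 %r, 15
  // would be described by Opcode = Mul, Frac = 15, RoundAt = 14.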
424 
425   auto getNumSignificantBits(Value *V, Instruction *In) const
426       -> std::pair<unsigned, Signedness>;
427   auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
428 
429   auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
430   auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
431 
432   auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
433                             const FxpOp &Op) const -> Value *;
434   auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
435                     bool Rounding) const -> Value *;
436   auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
437                     bool Rounding) const -> Value *;
438   // Return {Result, Carry}, where Carry is a vector predicate.
439   auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
440                       Value *CarryIn = nullptr) const
441       -> std::pair<Value *, Value *>;
442   auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
443   auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
444       -> Value *;
445   auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
446       -> std::pair<Value *, Value *>;
447   auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
448                      ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
449   auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
450                      Signedness SgnX, ArrayRef<Value *> WordY,
451                      Signedness SgnY) const -> SmallVector<Value *>;
452 
453   VectorType *HvxI32Ty;
454   VectorType *HvxP32Ty;
455   const HexagonVectorCombine &HVC;
456 
457   friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
458 };
459 
460 [[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
461                                          const HvxIdioms::FxpOp &Op) {
462   static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
463   OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
464   if (Op.RoundAt.has_value()) {
465     if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
466       OS << ":rnd";
467     } else {
468       OS << " + 1<<" << *Op.RoundAt;
469     }
470   }
471   OS << "\n  X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
472      << "  Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
473   return OS;
474 }
475 
476 } // namespace
477 
478 namespace {
479 
480 template <typename T> T *getIfUnordered(T *MaybeT) {
481   return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
482 }
483 template <typename T> T *isCandidate(Instruction *In) {
484   return dyn_cast<T>(In);
485 }
486 template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
487   return getIfUnordered(dyn_cast<LoadInst>(In));
488 }
489 template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
490   return getIfUnordered(dyn_cast<StoreInst>(In));
491 }
492 
493 #if !defined(_MSC_VER) || _MSC_VER >= 1926
494 // VS2017 and some versions of VS2019 have trouble compiling this:
495 // error C2976: 'std::map': too few template arguments
496 // VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
497 template <typename Pred, typename... Ts>
498 void erase_if(std::map<Ts...> &map, Pred p)
499 #else
500 template <typename Pred, typename T, typename U>
501 void erase_if(std::map<T, U> &map, Pred p)
502 #endif
503 {
504   for (auto i = map.begin(), e = map.end(); i != e;) {
505     if (p(*i))
506       i = map.erase(i);
507     else
508       i = std::next(i);
509   }
510 }
511 
512 // Forward other erase_ifs to the LLVM implementations.
513 template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
514   llvm::erase_if(std::forward<T>(container), p);
515 }
516 
517 } // namespace
518 
519 // --- Begin AlignVectors
520 
521 // For brevity, only consider loads. We identify a group of loads where we
522 // know the relative differences between their addresses, so we know how they
523 // are laid out in memory (relative to one another). These loads can overlap,
524 // can be shorter or longer than the desired vector length.
525 // Ultimately we want to generate a sequence of aligned loads that will load
526 // every byte that the original loads loaded, and have the program use these
527 // loaded values instead of the original loads.
528 // We consider the contiguous memory area spanned by all these loads.
529 //
530 // Let's say that a single aligned vector load can load 16 bytes at a time.
531 // If the program wanted to use a byte at offset 13 from the beginning of the
532 // original span, it will be a byte at offset 13+x in the aligned data for
533 // some x>=0. This may happen to be in the first aligned load, or in the load
534 // following it. Since we generally don't know what that alignment value
535 // is at compile time, we proactively do valigns on the aligned loads, so that
536 // the byte that was at offset 13 is still at offset 13 after the valigns.
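// As an illustration, with 16-byte sectors and a misalignment of 5 bytes,
// the byte at span offset 13 sits at offset 13+5 = 18 of the raw aligned
// data, i.e. in the second aligned load; a valign by 5 applied to each
// adjacent pair of aligned loads shifts it back to offset 13 of the result.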
537 //
538 // This will be the starting point for making the rest of the program use the
539 // data loaded by the new loads.
540 // For each original load, and its users:
541 //   %v = load ...
542 //   ... = %v
543 //   ... = %v
544 // we create
545 //   %new_v = extract/combine/shuffle data from loaded/valigned vectors so
546 //            it contains the same value as %v did before
547 // then replace all users of %v with %new_v.
548 //   ... = %new_v
549 //   ... = %new_v
550 
551 auto AlignVectors::ByteSpan::extent() const -> int {
552   if (size() == 0)
553     return 0;
554   int Min = Blocks[0].Pos;
555   int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
556   for (int i = 1, e = size(); i != e; ++i) {
557     Min = std::min(Min, Blocks[i].Pos);
558     Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
559   }
560   return Max - Min;
561 }
562 
563 auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
564   ByteSpan Section;
565   for (const ByteSpan::Block &B : Blocks) {
566     int L = std::max(B.Pos, Start);                       // Left end.
567     int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
568     if (L < R) {
569       // How much to chop off the beginning of the segment:
570       int Off = L > B.Pos ? L - B.Pos : 0;
571       Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
572     }
573   }
574   return Section;
575 }
576 
577 auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
578   for (Block &B : Blocks)
579     B.Pos += Offset;
580   return *this;
581 }
582 
583 auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
584   SmallVector<Value *, 8> Values(Blocks.size());
585   for (int i = 0, e = Blocks.size(); i != e; ++i)
586     Values[i] = Blocks[i].Seg.Val;
587   return Values;
588 }
589 
590 auto AlignVectors::getAlignFromValue(const Value *V) const -> Align {
591   const auto *C = dyn_cast<ConstantInt>(V);
592   assert(C && "Alignment must be a compile-time constant integer");
593   return C->getAlignValue();
594 }
595 
596 auto AlignVectors::getAddrInfo(Instruction &In) const
597     -> std::optional<AddrInfo> {
598   if (auto *L = isCandidate<LoadInst>(&In))
599     return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
600                     L->getAlign());
601   if (auto *S = isCandidate<StoreInst>(&In))
602     return AddrInfo(HVC, S, S->getPointerOperand(),
603                     S->getValueOperand()->getType(), S->getAlign());
604   if (auto *II = isCandidate<IntrinsicInst>(&In)) {
605     Intrinsic::ID ID = II->getIntrinsicID();
606     switch (ID) {
607     case Intrinsic::masked_load:
608       return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
609                       getAlignFromValue(II->getArgOperand(1)));
610     case Intrinsic::masked_store:
611       return AddrInfo(HVC, II, II->getArgOperand(1),
612                       II->getArgOperand(0)->getType(),
613                       getAlignFromValue(II->getArgOperand(2)));
614     }
615   }
616   return std::nullopt;
617 }
618 
619 auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
620   return HVC.HST.isTypeForHVX(AI.ValTy);
621 }
622 
623 auto AlignVectors::getPayload(Value *Val) const -> Value * {
624   if (auto *In = dyn_cast<Instruction>(Val)) {
625     Intrinsic::ID ID = 0;
626     if (auto *II = dyn_cast<IntrinsicInst>(In))
627       ID = II->getIntrinsicID();
628     if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
629       return In->getOperand(0);
630   }
631   return Val;
632 }
633 
634 auto AlignVectors::getMask(Value *Val) const -> Value * {
635   if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
636     switch (II->getIntrinsicID()) {
637     case Intrinsic::masked_load:
638       return II->getArgOperand(2);
639     case Intrinsic::masked_store:
640       return II->getArgOperand(3);
641     }
642   }
643 
644   Type *ValTy = getPayload(Val)->getType();
645   if (auto *VecTy = dyn_cast<VectorType>(ValTy))
646     return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy)));
647   return HVC.getFullValue(HVC.getBoolTy());
648 }
649 
650 auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
651   if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
652     if (II->getIntrinsicID() == Intrinsic::masked_load)
653       return II->getArgOperand(3);
654   }
655   return UndefValue::get(getPayload(Val)->getType());
656 }
657 
658 auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
659                                          Type *ValTy, int Adjust) const
660     -> Value * {
661   // The adjustment is in bytes, but if it's a multiple of the type size,
662   // we don't need to do pointer casts.
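  // For example (illustrative), adjusting an i32* by 8 bytes can use a GEP
  // over i32 with index 2, whereas adjusting it by 6 bytes falls back to
  // the byte-based (i8) GEP below.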
663   auto *PtrTy = cast<PointerType>(Ptr->getType());
664   if (!PtrTy->isOpaque()) {
665     Type *ElemTy = PtrTy->getNonOpaquePointerElementType();
666     int ElemSize = HVC.getSizeOf(ElemTy, HVC.Alloc);
667     if (Adjust % ElemSize == 0 && Adjust != 0) {
668       Value *Tmp0 = Builder.CreateGEP(
669           ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize), "gep");
670       return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo(), "cst");
671     }
672   }
673 
674   PointerType *CharPtrTy = Type::getInt8PtrTy(HVC.F.getContext());
675   Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy, "cst");
676   Value *Tmp1 = Builder.CreateGEP(Type::getInt8Ty(HVC.F.getContext()), Tmp0,
677                                   HVC.getConstInt(Adjust), "gep");
678   return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo(), "cst");
679 }
680 
681 auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
682                                         Type *ValTy, int Alignment) const
683     -> Value * {
684   Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
685   Value *Mask = HVC.getConstInt(-Alignment);
686   Value *And = Builder.CreateAnd(AsInt, Mask, "and");
687   return Builder.CreateIntToPtr(And, ValTy->getPointerTo(), "itp");
688 }
689 
690 auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
691                               Value *Predicate, int Alignment, Value *Mask,
692                               Value *PassThru,
693                               ArrayRef<Value *> MDSources) const -> Value * {
694   bool HvxHasPredLoad = HVC.HST.useHVXV62Ops();
695   // Predicate is nullptr if not creating predicated load
696   if (Predicate) {
697     assert(!Predicate->getType()->isVectorTy() &&
698            "Expecting scalar predicate");
699     if (HVC.isFalse(Predicate))
700       return UndefValue::get(ValTy);
701     if (!HVC.isTrue(Predicate) && HvxHasPredLoad) {
702       Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
703                                          Alignment, MDSources);
704       return Builder.CreateSelect(Mask, Load, PassThru);
705     }
706     // Predicate == true here.
707   }
708   assert(!HVC.isUndef(Mask)); // Should this be allowed?
709   if (HVC.isZero(Mask))
710     return PassThru;
711   if (HVC.isTrue(Mask))
712     return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
713 
714   Instruction *Load = Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment),
715                                                Mask, PassThru, "mld");
716   propagateMetadata(Load, MDSources);
717   return Load;
718 }
719 
720 auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
721                                     Value *Ptr, int Alignment,
722                                     ArrayRef<Value *> MDSources) const
723     -> Value * {
724   Instruction *Load =
725       Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment), "ald");
726   propagateMetadata(Load, MDSources);
727   return Load;
728 }
729 
730 auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
731                                         Value *Ptr, Value *Predicate,
732                                         int Alignment,
733                                         ArrayRef<Value *> MDSources) const
734     -> Value * {
735   assert(HVC.HST.isTypeForHVX(ValTy) &&
736          "Predicated 'scalar' vector loads not yet supported");
737   assert(Predicate);
738   assert(!Predicate->getType()->isVectorTy() && "Expecting scalar predicate");
739   assert(HVC.getSizeOf(ValTy, HVC.Alloc) % Alignment == 0);
740   if (HVC.isFalse(Predicate))
741     return UndefValue::get(ValTy);
742   if (HVC.isTrue(Predicate))
743     return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
744 
745   auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);
746   // FIXME: This may not put the offset from Ptr into the vmem offset.
747   return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,
748                                 {Predicate, Ptr, HVC.getConstInt(0)},
749                                 std::nullopt, MDSources);
750 }
751 
752 auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
753                                Value *Predicate, int Alignment, Value *Mask,
754                                ArrayRef<Value *> MDSources) const -> Value * {
755   if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
756     return UndefValue::get(Val->getType());
757   assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
758                         "Expecting scalar predicate"));
759   if (Predicate) {
760     if (HVC.isFalse(Predicate))
761       return UndefValue::get(Val->getType());
762     if (HVC.isTrue(Predicate))
763       Predicate = nullptr;
764   }
765   // Here both Predicate and Mask are true or unknown.
766 
767   if (HVC.isTrue(Mask)) {
768     if (Predicate) { // Predicate unknown
769       return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
770                                    MDSources);
771     }
772     // Predicate is true:
773     return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
774   }
775 
776   // Mask is unknown
777   if (!Predicate) {
778     Instruction *Store =
779         Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
780     propagateMetadata(Store, MDSources);
781     return Store;
782   }
783 
784   // Both Predicate and Mask are unknown.
785   // Emulate masked store with predicated-load + mux + predicated-store.
786   Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,
787                                          Predicate, Alignment, MDSources);
788   Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);
789   return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,
790                                MDSources);
791 }
792 
793 auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
794                                      Value *Ptr, int Alignment,
795                                      ArrayRef<Value *> MDSources) const
796     -> Value * {
797   Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
798   propagateMetadata(Store, MDSources);
799   return Store;
800 }
801 
802 auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
803                                          Value *Ptr, Value *Predicate,
804                                          int Alignment,
805                                          ArrayRef<Value *> MDSources) const
806     -> Value * {
807   assert(HVC.HST.isTypeForHVX(Val->getType()) &&
808          "Predicated 'scalar' vector stores not yet supported");
809   assert(Predicate);
810   if (HVC.isFalse(Predicate))
811     return UndefValue::get(Val->getType());
812   if (HVC.isTrue(Predicate))
813     return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
814 
815   assert(HVC.getSizeOf(Val, HVC.Alloc) % Alignment == 0);
816   auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);
817   // FIXME: This may not put the offset from Ptr into the vmem offset.
818   return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,
819                                 {Predicate, Ptr, HVC.getConstInt(0), Val},
820                                 std::nullopt, MDSources);
821 }
822 
823 auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
824     -> DepList {
825   BasicBlock *Parent = Base->getParent();
826   assert(In->getParent() == Parent &&
827          "Base and In should be in the same block");
828   assert(Base->comesBefore(In) && "Base should come before In");
829 
830   DepList Deps;
831   std::deque<Instruction *> WorkQ = {In};
832   while (!WorkQ.empty()) {
833     Instruction *D = WorkQ.front();
834     WorkQ.pop_front();
835     Deps.insert(D);
836     for (Value *Op : D->operands()) {
837       if (auto *I = dyn_cast<Instruction>(Op)) {
838         if (I->getParent() == Parent && Base->comesBefore(I))
839           WorkQ.push_back(I);
840       }
841     }
842   }
843   return Deps;
844 }
845 
846 auto AlignVectors::createAddressGroups() -> bool {
847   // An address group created here may contain instructions spanning
848   // multiple basic blocks.
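  // For example (hypothetical addresses), loads from %p, %p+64 and %p+128
  // whose pointer differences are compile-time computable end up in a single
  // group keyed by the first such instruction, with Offsets 0, 64 and 128.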
849   AddrList WorkStack;
850 
851   auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
852     for (AddrInfo &W : WorkStack) {
853       if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
854         return std::make_pair(W.Inst, *D);
855     }
856     return std::make_pair(nullptr, 0);
857   };
858 
859   auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
860     BasicBlock &Block = *DomN->getBlock();
861     for (Instruction &I : Block) {
862       auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
863       if (!AI)
864         continue;
865       auto F = findBaseAndOffset(*AI);
866       Instruction *GroupInst;
867       if (Instruction *BI = F.first) {
868         AI->Offset = F.second;
869         GroupInst = BI;
870       } else {
871         WorkStack.push_back(*AI);
872         GroupInst = AI->Inst;
873       }
874       AddrGroups[GroupInst].push_back(*AI);
875     }
876 
877     for (DomTreeNode *C : DomN->children())
878       Visit(C, Visit);
879 
880     while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
881       WorkStack.pop_back();
882   };
883 
884   traverseBlock(HVC.DT.getRootNode(), traverseBlock);
885   assert(WorkStack.empty());
886 
887   // AddrGroups are formed.
888 
889   // Remove groups of size 1.
890   erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
891   // Remove groups that don't use HVX types.
892   erase_if(AddrGroups, [&](auto &G) {
893     return llvm::none_of(
894         G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
895   });
896 
897   return !AddrGroups.empty();
898 }
899 
900 auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
901   // Form load groups.
902   // To avoid complications with moving code across basic blocks, only form
903   // groups that are contained within a single basic block.
904 
905   auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
906     assert(!Move.Main.empty() && "Move group should have non-empty Main");
907     // Don't mix HVX and non-HVX instructions.
908     if (Move.IsHvx != isHvx(Info))
909       return false;
910     // Leading instruction in the load group.
911     Instruction *Base = Move.Main.front();
912     if (Base->getParent() != Info.Inst->getParent())
913       return false;
914 
915     auto isSafeToMoveToBase = [&](const Instruction *I) {
916       return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator());
917     };
918     DepList Deps = getUpwardDeps(Info.Inst, Base);
919     if (!llvm::all_of(Deps, isSafeToMoveToBase))
920       return false;
921 
922     // The dependencies will be moved together with the load, so make sure
923     // that none of them could be moved independently in another group.
924     Deps.erase(Info.Inst);
925     auto inAddrMap = [&](Instruction *I) { return AddrGroups.count(I) > 0; };
926     if (llvm::any_of(Deps, inAddrMap))
927       return false;
928     Move.Main.push_back(Info.Inst);
929     llvm::append_range(Move.Deps, Deps);
930     return true;
931   };
932 
933   MoveList LoadGroups;
934 
935   for (const AddrInfo &Info : Group) {
936     if (!Info.Inst->mayReadFromMemory())
937       continue;
938     if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
939       LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
940   }
941 
942   // Erase singleton groups.
943   erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
944 
945   // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
946   if (!HVC.HST.useHVXV62Ops())
947     erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
948 
949   return LoadGroups;
950 }
951 
952 auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
953   // Form store groups.
954   // To avoid complications with moving code across basic blocks, only form
955   // groups that are contained within a single basic block.
956 
957   auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
958     assert(!Move.Main.empty() && "Move group should have non-empty Main");
959     // For stores with return values we'd have to collect downward dependencies.
960     // There are no such stores that we handle at the moment, so omit that.
961     assert(Info.Inst->getType()->isVoidTy() &&
962            "Not handling stores with return values");
963     // Don't mix HVX and non-HVX instructions.
964     if (Move.IsHvx != isHvx(Info))
965       return false;
966     // For stores we need to be careful whether it's safe to move them.
967     // Stores that are otherwise safe to move together may not appear safe
968     // to move over one another (i.e. isSafeToMoveBefore may return false).
969     Instruction *Base = Move.Main.front();
970     if (Base->getParent() != Info.Inst->getParent())
971       return false;
972     if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
973       return false;
974     Move.Main.push_back(Info.Inst);
975     return true;
976   };
977 
978   MoveList StoreGroups;
979 
980   for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
981     const AddrInfo &Info = *I;
982     if (!Info.Inst->mayWriteToMemory())
983       continue;
984     if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
985       StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
986   }
987 
988   // Erase singleton groups.
989   erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
990 
991   // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
992   if (!HVC.HST.useHVXV62Ops())
993     erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
994 
995   // Erase groups where every store is a full HVX vector. The reason is that
996   // aligning predicated stores generates complex code that may be less
997   // efficient than a sequence of unaligned vector stores.
998   if (!AlignFullHvxStores) {
999     erase_if(StoreGroups, [this](const MoveGroup &G) {
1000       return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {
1001                auto MaybeInfo = this->getAddrInfo(*S);
1002                assert(MaybeInfo.has_value());
1003                return HVC.HST.isHVXVectorType(
1004                    EVT::getEVT(MaybeInfo->ValTy, false));
1005              });
1006     });
1007   }
1008 
1009   return StoreGroups;
1010 }
1011 
1012 auto AlignVectors::move(const MoveGroup &Move) const -> bool {
1013   assert(!Move.Main.empty() && "Move group should have non-empty Main");
1014   Instruction *Where = Move.Main.front();
1015 
1016   if (Move.IsLoad) {
1017     // Move all deps to before Where, keeping order.
1018     for (Instruction *D : Move.Deps)
1019       D->moveBefore(Where);
1020     // Move all main instructions to after Where, keeping order.
1021     ArrayRef<Instruction *> Main(Move.Main);
1022     for (Instruction *M : Main.drop_front(1)) {
1023       M->moveAfter(Where);
1024       Where = M;
1025     }
1026   } else {
1027     // NOTE: Deps are empty for "store" groups. If they need to be
1028     // non-empty, decide on the order.
1029     assert(Move.Deps.empty());
1030     // Move all main instructions to before Where, inverting order.
1031     ArrayRef<Instruction *> Main(Move.Main);
1032     for (Instruction *M : Main.drop_front(1)) {
1033       M->moveBefore(Where);
1034       Where = M;
1035     }
1036   }
1037 
1038   return Move.Main.size() + Move.Deps.size() > 1;
1039 }
1040 
1041 auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1042                                     const ByteSpan &VSpan, int ScLen,
1043                                     Value *AlignVal, Value *AlignAddr) const
1044     -> void {
1045   LLVM_DEBUG(dbgs() << __func__ << "\n");
1046 
1047   Type *SecTy = HVC.getByteTy(ScLen);
1048   int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1049   bool DoAlign = !HVC.isZero(AlignVal);
1050   BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1051   BasicBlock *BaseBlock = Builder.GetInsertBlock();
1052 
1053   ByteSpan ASpan;
1054   auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
1055   auto *Undef = UndefValue::get(SecTy);
1056 
1057   SmallVector<Instruction *> Loads(NumSectors + DoAlign, nullptr);
1058 
1059   // We could create all of the aligned loads, and generate the valigns
1060   // at the location of the first load, but for large load groups, this
1061   // could create highly suboptimal code (there have been groups of 140+
1062   // loads in real code).
1063   // Instead, place the loads/valigns as close to the users as possible.
1064   // In any case we need to have a mapping from the blocks of VSpan (the
1065   // span covered by the pre-existing loads) to ASpan (the span covered
1066   // by the aligned loads). There is a small problem, though: ASpan needs
1067   // to have pointers to the loads/valigns, but we don't have these loads
1068   // because we don't know where to put them yet. We find out by creating
1069   // a section of ASpan that corresponds to values (blocks) from VSpan,
1070   // and checking where the new load should be placed. We need to attach
1071   // this location information to each block in ASpan somehow, so we put
1072   // distinct values for Seg.Val in each ASpan.Blocks[i], and use a map
1073   // to store the location for each Seg.Val.
1074   // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1075   // which helps with printing ByteSpans without crashing when printing
1076   // Segments with these temporary identifiers in place of Val.
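  // For instance, ASpan.Blocks[2].Seg.Val temporarily holds &ASpan.Blocks[2],
  // and EarliestUser is keyed by that pointer; the placeholder is replaced
  // with the real load/valign value once it has been created below.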
1077 
1078   // Populate the blocks first, to avoid reallocations of the vector
1079   // interfering with generating the placeholder addresses.
1080   for (int Index = 0; Index != NumSectors; ++Index)
1081     ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
1082   for (int Index = 0; Index != NumSectors; ++Index) {
1083     ASpan.Blocks[Index].Seg.Val =
1084         reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1085   }
1086 
1087   // Multiple values from VSpan can map to the same value in ASpan. Since we
1088   // try to create loads lazily, we need to find the earliest use for each
1089   // value from ASpan.
1090   DenseMap<void *, Instruction *> EarliestUser;
1091   auto isEarlier = [](Instruction *A, Instruction *B) {
1092     if (B == nullptr)
1093       return true;
1094     if (A == nullptr)
1095       return false;
1096     assert(A->getParent() == B->getParent());
1097     return A->comesBefore(B);
1098   };
1099   auto earliestUser = [&](const auto &Uses) {
1100     Instruction *User = nullptr;
1101     for (const Use &U : Uses) {
1102       auto *I = dyn_cast<Instruction>(U.getUser());
1103       assert(I != nullptr && "Load used in a non-instruction?");
1104       // Make sure we only consider users in this block, but we need
1105       // to remember if there were users outside the block too. This is
1106       // because if no users are found, aligned loads will not be created.
1107       if (I->getParent() == BaseBlock) {
1108         if (!isa<PHINode>(I))
1109           User = std::min(User, I, isEarlier);
1110       } else {
1111         User = std::min(User, BaseBlock->getTerminator(), isEarlier);
1112       }
1113     }
1114     return User;
1115   };
1116 
1117   for (const ByteSpan::Block &B : VSpan) {
1118     ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
1119     for (const ByteSpan::Block &S : ASection) {
1120       EarliestUser[S.Seg.Val] = std::min(
1121           EarliestUser[S.Seg.Val], earliestUser(B.Seg.Val->uses()), isEarlier);
1122     }
1123   }
1124 
1125   LLVM_DEBUG({
1126     dbgs() << "ASpan:\n" << ASpan << '\n';
1127     dbgs() << "Earliest users of ASpan:\n";
1128     for (auto &[Val, User] : EarliestUser) {
1129       dbgs() << Val << "\n ->" << *User << '\n';
1130     }
1131   });
1132 
1133   auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1134                         int Index, bool MakePred) {
1135     Value *Ptr =
1136         createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1137     Value *Predicate =
1138         MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1139 
1140     // If vector shifting is potentially needed, accumulate metadata
1141     // from source sections of twice the load width.
1142     int Start = (Index - DoAlign) * ScLen;
1143     int Width = (1 + DoAlign) * ScLen;
1144     Value *Load = this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True,
1145                                    Undef, VSpan.section(Start, Width).values());
1146     return cast<Instruction>(Load);
1147   };
1148 
1149   auto moveBefore = [this](Instruction *In, Instruction *To) {
1150     // Move In and its upward dependencies to before To.
1151     assert(In->getParent() == To->getParent());
1152     DepList Deps = getUpwardDeps(In, To);
1153     // DepList is sorted with respect to positions in the basic block.
1154     for (Instruction *I : Deps)
1155       I->moveBefore(To);
1156   };
1157 
1158   // Generate necessary loads at appropriate locations.
1159   LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1160   for (int Index = 0; Index != NumSectors + 1; ++Index) {
1161     // In ASpan, each block will be either a single aligned load, or a
1162     // valign of a pair of loads. In the latter case, an aligned load j
1163     // will belong to the current valign, and the one in the previous
1164     // block (for j > 0).
1165     // Place the load at a location which will dominate the valign, assuming
1166     // the valign will be placed right before the earliest user.
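    // For example, with DoAlign set, Loads[1] is the low input of ASpan[1]'s
    // valign and the high input of ASpan[0]'s valign, so it is placed before
    // the earlier of the two earliest users.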
1167     Instruction *PrevAt =
1168         DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1169     Instruction *ThisAt =
1170         Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1171     if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
1172       Builder.SetInsertPoint(Where);
1173       Loads[Index] =
1174           createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1175       // We know it's safe to put the load at BasePos, but we'd prefer to put
1176       // it at "Where". To see if the load is safe to be placed at Where, put
1177       // it there first and then check if it's safe to move it to BasePos.
1178       // If not, then the load needs to be placed at BasePos.
1179       // We can't do this check proactively because we need the load to exist
1180       // in order to check legality.
1181       if (!HVC.isSafeToMoveBeforeInBB(*Loads[Index], BasePos))
1182         moveBefore(Loads[Index], &*BasePos);
1183       LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1184     }
1185   }
1186 
1187   // Generate valigns if needed, and fill in proper values in ASpan
1188   LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1189   for (int Index = 0; Index != NumSectors; ++Index) {
1190     ASpan[Index].Seg.Val = nullptr;
1191     if (auto *Where = EarliestUser[&ASpan[Index]]) {
1192       Builder.SetInsertPoint(Where);
1193       Value *Val = Loads[Index];
1194       assert(Val != nullptr);
1195       if (DoAlign) {
1196         Value *NextLoad = Loads[Index + 1];
1197         assert(NextLoad != nullptr);
1198         Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
1199       }
1200       ASpan[Index].Seg.Val = Val;
1201       LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1202     }
1203   }
1204 
1205   for (const ByteSpan::Block &B : VSpan) {
1206     ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
1207     Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
1208     Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
1209 
1210     // We're generating a reduction, where each instruction depends on
1211     // the previous one, so we need to order them according to the position
1212     // of their inputs in the code.
1213     std::vector<ByteSpan::Block *> ABlocks;
1214     for (ByteSpan::Block &S : ASection) {
1215       if (S.Seg.Val != nullptr)
1216         ABlocks.push_back(&S);
1217     }
1218     llvm::sort(ABlocks,
1219                [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1220                  return isEarlier(cast<Instruction>(A->Seg.Val),
1221                                   cast<Instruction>(B->Seg.Val));
1222                });
1223     for (ByteSpan::Block *S : ABlocks) {
1224       // The processing of the data loaded by the aligned loads
1225       // needs to be inserted after the data is available.
1226       Instruction *SegI = cast<Instruction>(S->Seg.Val);
1227       Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
1228       Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
1229       Accum =
1230           HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
1231     }
1232     // Instead of casting everything to bytes for the vselect, cast to the
1233     // original value type. This will avoid complications with casting masks.
1234     // For example, in cases when the original mask applies to i32 elements, it could
1235     // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1236     // but if the mask is not exactly of HVX length, extra handling would be
1237     // needed to make it work.
1238     Type *ValTy = getPayload(B.Seg.Val)->getType();
1239     Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
1240     Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
1241                                       getPassThrough(B.Seg.Val), "sel");
1242     B.Seg.Val->replaceAllUsesWith(Sel);
1243   }
1244 }
1245 
1246 auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1247                                      const ByteSpan &VSpan, int ScLen,
1248                                      Value *AlignVal, Value *AlignAddr) const
1249     -> void {
1250   LLVM_DEBUG(dbgs() << __func__ << "\n");
1251 
1252   Type *SecTy = HVC.getByteTy(ScLen);
1253   int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1254   bool DoAlign = !HVC.isZero(AlignVal);
1255 
1256   // Stores.
1257   ByteSpan ASpanV, ASpanM;
1258 
1259   // Return a vector value corresponding to the input value Val:
1260   // either <1 x Val> for scalar Val, or Val itself for vector Val.
1261   auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1262     Type *Ty = Val->getType();
1263     if (Ty->isVectorTy())
1264       return Val;
1265     auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1266     return Builder.CreateBitCast(Val, VecTy, "cst");
1267   };
1268 
1269   // Create an extra "undef" sector at the beginning and at the end.
1270   // They will be used as the left/right filler in the vlalign step.
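  // For example, with NumSectors == 2 and DoAlign set, sectors are built for
  // indices -1, 0, 1 and 2; the -1 and 2 sectors start out as undef data
  // with an all-zero mask and only receive bytes shifted in by vlalign.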
1271   for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1272     // For stores, the size of each section is an aligned vector length.
1273     // Adjust the store offsets relative to the section start offset.
1274     ByteSpan VSection =
1275         VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);
1276     Value *AccumV = UndefValue::get(SecTy);
1277     Value *AccumM = HVC.getNullValue(SecTy);
1278     for (ByteSpan::Block &S : VSection) {
1279       Value *Pay = getPayload(S.Seg.Val);
1280       Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1281                                 Pay->getType(), HVC.getByteTy());
1282       AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
1283                            S.Seg.Start, S.Seg.Size, S.Pos);
1284       AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
1285                            S.Seg.Start, S.Seg.Size, S.Pos);
1286     }
1287     ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);
1288     ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);
1289   }
1290 
1291   // vlalign
1292   if (DoAlign) {
1293     for (int Index = 1; Index != NumSectors + 2; ++Index) {
1294       Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1295       Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1296       assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1297       ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1298       ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1299     }
1300   }
1301 
1302   auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1303                          const ByteSpan &ASpanM, int Index, bool MakePred) {
1304     Value *Val = ASpanV[Index].Seg.Val;
1305     Value *Mask = ASpanM[Index].Seg.Val; // bytes
1306     if (HVC.isUndef(Val) || HVC.isZero(Mask))
1307       return;
1308     Value *Ptr =
1309         createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1310     Value *Predicate =
1311         MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1312 
1313     // If vector shifting is potentially needed, accumulate metadata
1314     // from source sections of twice the store width.
1315     int Start = (Index - DoAlign) * ScLen;
1316     int Width = (1 + DoAlign) * ScLen;
1317     this->createStore(Builder, Val, Ptr, Predicate, ScLen,
1318                       HVC.vlsb(Builder, Mask),
1319                       VSpan.section(Start, Width).values());
1320   };
1321 
1322   for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1323     createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1324   }
1325 }
1326 
1327 auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
1328   LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1329 
1330   // TODO: Needs support for masked loads/stores of "scalar" vectors.
1331   if (!Move.IsHvx)
1332     return false;
1333 
1334   // Return the element with the maximum alignment from Range,
1335   // where GetValue obtains the value to compare from an element.
1336   auto getMaxOf = [](auto Range, auto GetValue) {
1337     return *std::max_element(
1338         Range.begin(), Range.end(),
1339         [&GetValue](auto &A, auto &B) { return GetValue(A) < GetValue(B); });
1340   };
1341 
1342   const AddrList &BaseInfos = AddrGroups.at(Move.Base);
1343 
1344   // Conceptually, there is a vector of N bytes covering the addresses
1345   // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1346   // represents a contiguous memory region that spans all accessed memory
1347   // locations.
1348   // The correspondence between loaded or stored values will be expressed
1349   // in terms of this vector. For example, the 0th element of the vector
1350   // from the Base address info will start at byte Start from the beginning
1351   // of this conceptual vector.
1352   //
1353   // This vector will be loaded/stored starting at the nearest down-aligned
1354   // address, and the amount of the down-alignment will be AlignVal:
1355   //   valign(load_vector(align_down(Base+Start)), AlignVal)
1356 
1357   std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1358   AddrList MoveInfos;
1359   llvm::copy_if(
1360       BaseInfos, std::back_inserter(MoveInfos),
1361       [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1362 
1363   // Maximum alignment present in the whole address group.
1364   const AddrInfo &WithMaxAlign =
1365       getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1366   Align MaxGiven = WithMaxAlign.HaveAlign;
1367 
1368   // Address info with the minimum offset (lowest address) in the group.
1369   const AddrInfo &WithMinOffset =
1370       getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1371 
1372   const AddrInfo &WithMaxNeeded =
1373       getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1374   Align MinNeeded = WithMaxNeeded.NeedAlign;
1375 
1376   // Set the builder's insertion point right before the load group, or
1377   // immediately after the store group. (Instructions in a store group are
1378   // listed in reverse order.)
1379   Instruction *InsertAt = Move.Main.front();
1380   if (!Move.IsLoad) {
1381     // There should be a terminator (which a store isn't, but check anyway).
1382     assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1383     InsertAt = &*std::next(InsertAt->getIterator());
1384   }
1385 
1386   IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1387                     InstSimplifyFolder(HVC.DL));
1388   Value *AlignAddr = nullptr; // Actual aligned address.
1389   Value *AlignVal = nullptr;  // Right-shift amount (for valign).
1390 
1391   if (MinNeeded <= MaxGiven) {
1392     int Start = WithMinOffset.Offset;
1393     int OffAtMax = WithMaxAlign.Offset;
1394     // Shift the offset of the maximally aligned instruction (OffAtMax)
1395     // back by just enough multiples of the required alignment to cover the
1396     // distance from Start to OffAtMax.
1397     // Calculate the address adjustment amount based on the address with the
1398     // maximum alignment. This is to allow a simple gep instruction instead
1399     // of potential bitcasts to i8*.
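         // For illustration, with hypothetical values Start = -4,
         // OffAtMax = 100 and MinNeeded = 64: Adjust = -alignTo(104, 64)
         // = -128, AlignAddr = Base + 100 - 128 (64-byte aligned, not above
         // Base + Start), and Diff = -4 - (100 - 128) = 24, with 0 <= 24 < 64.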
1400     int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1401     AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1402                                       WithMaxAlign.ValTy, Adjust);
1403     int Diff = Start - (OffAtMax + Adjust);
1404     AlignVal = HVC.getConstInt(Diff);
1405     assert(Diff >= 0);
1406     assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1407   } else {
1408     // WithMinOffset is the lowest address in the group,
1409     //   WithMinOffset.Addr = Base+Start.
1410     // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1411     // mask off unnecessary bits, so it's ok to just use the original pointer as
1412     // the alignment amount.
1413     // Do an explicit down-alignment of the address to avoid creating an
1414     // aligned instruction with an address that is not really aligned.
1415     AlignAddr = createAlignedPointer(Builder, WithMinOffset.Addr,
1416                                      WithMinOffset.ValTy, MinNeeded.value());
1417     AlignVal =
1418         Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
1419   }
1420 
1421   ByteSpan VSpan;
1422   for (const AddrInfo &AI : MoveInfos) {
1423     VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1424                               AI.Offset - WithMinOffset.Offset);
1425   }
1426 
1427   // The aligned loads/stores will use blocks that are either scalars,
1428   // or HVX vectors. Let "sector" be the unified term for such a block.
1429   // blend(scalar, vector) -> sector...
1430   int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1431                          : std::max<int>(MinNeeded.value(), 4);
1432   assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1433   assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1434 
1435   LLVM_DEBUG({
1436     dbgs() << "ScLen:  " << ScLen << "\n";
1437     dbgs() << "AlignVal:" << *AlignVal << "\n";
1438     dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1439     dbgs() << "VSpan:\n" << VSpan << '\n';
1440   });
1441 
1442   if (Move.IsLoad)
1443     realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1444   else
1445     realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1446 
1447   for (auto *Inst : Move.Main)
1448     Inst->eraseFromParent();
1449 
1450   return true;
1451 }
1452 
1453 auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1454                                        int Alignment) const -> Value * {
1455   auto *AlignTy = AlignVal->getType();
1456   Value *And =
1457       Builder.CreateAnd(AlignVal, ConstantInt::get(AlignTy, Alignment - 1));
1458   Value *Zero = ConstantInt::get(AlignTy, 0);
1459   return Builder.CreateICmpNE(And, Zero, "isz");
1460 }
1461 
1462 auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1463   if (!HVC.isByteVecTy(Ty))
1464     return false;
1465   int Size = HVC.getSizeOf(Ty);
1466   if (HVC.HST.isTypeForHVX(Ty))
1467     return Size == static_cast<int>(HVC.HST.getVectorLength());
1468   return Size == 4 || Size == 8;
1469 }
1470 
1471 auto AlignVectors::run() -> bool {
1472   LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors\n");
1473   if (!createAddressGroups())
1474     return false;
1475 
1476   LLVM_DEBUG({
1477     dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1478     for (auto &[In, AL] : AddrGroups) {
1479       for (const AddrInfo &AI : AL)
1480         dbgs() << "---\n" << AI << '\n';
1481     }
1482   });
1483 
1484   bool Changed = false;
1485   MoveList LoadGroups, StoreGroups;
1486 
1487   for (auto &G : AddrGroups) {
1488     llvm::append_range(LoadGroups, createLoadGroups(G.second));
1489     llvm::append_range(StoreGroups, createStoreGroups(G.second));
1490   }
1491 
1492   LLVM_DEBUG({
1493     dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1494     for (const MoveGroup &G : LoadGroups)
1495       dbgs() << G << "\n";
1496     dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1497     for (const MoveGroup &G : StoreGroups)
1498       dbgs() << G << "\n";
1499   });
1500 
1501   for (auto &M : LoadGroups)
1502     Changed |= move(M);
1503   for (auto &M : StoreGroups)
1504     Changed |= move(M);
1505 
1506   LLVM_DEBUG(dbgs() << "After move:\n" << HVC.F);
1507 
1508   for (auto &M : LoadGroups)
1509     Changed |= realignGroup(M);
1510   for (auto &M : StoreGroups)
1511     Changed |= realignGroup(M);
1512 
1513   return Changed;
1514 }
1515 
1516 // --- End AlignVectors
1517 
1518 // --- Begin HvxIdioms
1519 
1520 auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1521     -> std::pair<unsigned, Signedness> {
1522   unsigned Bits = HVC.getNumSignificantBits(V, In);
1523   // The significant bits are calculated including the sign bit. This may
1524   // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1525   // result in 33 significant bits. To avoid using an extra word, skip the
1526   // extra sign bit, but keep the information that the value is to be
1527   // treated as unsigned.
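       // For example, a value produced by (zext i16 to i32) may report 17
       // significant bits; since bits [16..32) are known zero, it is narrowed
       // to 16 bits and marked Unsigned below.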
1528   KnownBits Known = HVC.getKnownBits(V, In);
1529   Signedness Sign = Signed;
1530   unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1531   if (isPowerOf2_32(Bits))
1532     NumToTest = Bits;
1533   else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1534     NumToTest = Bits - 1;
1535 
1536   if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1537     Sign = Unsigned;
1538     Bits = NumToTest;
1539   }
1540 
1541   // If the top bit within the nearest power-of-2 width is known zero,
1542   // this value is positive and can be treated as signed or unsigned.
1543   if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1544     if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1545       Sign = Positive;
1546   }
1547   return {Bits, Sign};
1548 }
1549 
1550 auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1551     -> std::pair<SValue, SValue> {
1552   // Canonicalize the signedness of X and Y, so that the result is one of:
1553   //   S, S
1554   //   U/P, S
1555   //   U/P, U/P
1556   if (X.Sgn == Signed && Y.Sgn != Signed)
1557     std::swap(X, Y);
1558   return {X, Y};
1559 }
1560 
1561 // Match
1562 //   (X * Y) [>> N], or
1563 //   ((X * Y) + (1 << M)) >> N
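     // For example, the rounding fixed-point pattern (on <32 x i32>)
     //   %m = mul %x, %y
     //   %a = add %m, (splat of 1 << 30)
     //   %s = ashr %a, (splat of 31)
     // is matched with Op.Frac = 31 and Op.RoundAt = 30.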
1564 auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1565   using namespace PatternMatch;
1566   auto *Ty = In.getType();
1567 
1568   if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1569     return std::nullopt;
1570 
1571   unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1572 
1573   FxpOp Op;
1574   Value *Exp = &In;
1575 
1576   // Fixed-point multiplication is always shifted right (except when the
1577   // fraction is 0 bits).
1578   auto m_Shr = [](auto &&V, auto &&S) {
1579     return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1580   };
1581 
1582   const APInt *Qn = nullptr;
1583   if (Value * T; match(Exp, m_Shr(m_Value(T), m_APInt(Qn)))) {
1584     Op.Frac = Qn->getZExtValue();
1585     Exp = T;
1586   } else {
1587     Op.Frac = 0;
1588   }
1589 
1590   if (Op.Frac > Width)
1591     return std::nullopt;
1592 
1593   // Check if there is rounding added.
1594   const APInt *C = nullptr;
1595   if (Value * T; Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_APInt(C)))) {
1596     uint64_t CV = C->getZExtValue();
1597     if (CV != 0 && !isPowerOf2_64(CV))
1598       return std::nullopt;
1599     if (CV != 0)
1600       Op.RoundAt = Log2_64(CV);
1601     Exp = T;
1602   }
1603 
1604   // Check if the rest is a multiplication.
1605   if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1606     Op.Opcode = Instruction::Mul;
1607     // FIXME: The information below is recomputed.
1608     Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1609     Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1610     Op.ResTy = cast<VectorType>(Ty);
1611     return Op;
1612   }
1613 
1614   return std::nullopt;
1615 }
1616 
1617 auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1618     -> Value * {
1619   assert(Op.X.Val->getType() == Op.Y.Val->getType());
1620 
1621   auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1622   if (VecTy == nullptr)
1623     return nullptr;
1624   auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1625   unsigned ElemWidth = ElemTy->getBitWidth();
1626 
1627   // TODO: This can be relaxed after legalization is done pre-isel.
1628   if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1629     return nullptr;
1630 
1631   // There are no special intrinsics that should be used for multiplying
1632   // signed 8-bit values, so just skip them. Normal codegen should handle
1633   // this just fine.
1634   if (ElemWidth <= 8)
1635     return nullptr;
1636   // Similarly, if this is just a multiplication that can be handled without
1637   // intervention, then leave it alone.
1638   if (ElemWidth <= 32 && Op.Frac == 0)
1639     return nullptr;
1640 
1641   auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1642   auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1643 
1644   // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1645 
1646   Value *X = Op.X.Val, *Y = Op.Y.Val;
1647   IRBuilder Builder(In.getParent(), In.getIterator(),
1648                     InstSimplifyFolder(HVC.DL));
1649 
1650   auto roundUpWidth = [](unsigned Width) -> unsigned {
1651     if (Width <= 32 && !isPowerOf2_32(Width)) {
1652       // If the element width is not a power of 2, round it up
1653       // to the next one. Do this for widths not exceeding 32.
1654       return PowerOf2Ceil(Width);
1655     }
1656     if (Width > 32 && Width % 32 != 0) {
1657       // For wider elements, round it up to the multiple of 32.
1658       return alignTo(Width, 32u);
1659     }
1660     return Width;
1661   };
1662 
1663   BitsX = roundUpWidth(BitsX);
1664   BitsY = roundUpWidth(BitsY);
1665 
1666   // For elementwise multiplication both vectors must have the same element
1667   // type, so resize the elements of both inputs to the same width, the max
1668   // of the calculated significant bits.
1669   unsigned Width = std::max(BitsX, BitsY);
1670 
1671   auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1672   if (Width < ElemWidth) {
1673     X = Builder.CreateTrunc(X, ResizeTy, "trn");
1674     Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
1675   } else if (Width > ElemWidth) {
1676     X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
1677                         : Builder.CreateZExt(X, ResizeTy, "zxt");
1678     Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
1679                         : Builder.CreateZExt(Y, ResizeTy, "zxt");
1680   }
1681 
1682   assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1683 
1684   unsigned VecLen = HVC.length(ResizeTy);
1685   unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
1686 
1687   SmallVector<Value *> Results;
1688   FxpOp ChopOp = Op;
1689   ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);
1690 
1691   for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1692     ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1693     ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1694     Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1695     if (Results.back() == nullptr)
1696       break;
1697   }
1698 
1699   if (Results.empty() || Results.back() == nullptr)
1700     return nullptr;
1701 
1702   Value *Cat = HVC.concat(Builder, Results);
1703   Value *Ext = SignX == Signed || SignY == Signed
1704                    ? Builder.CreateSExt(Cat, VecTy, "sxt")
1705                    : Builder.CreateZExt(Cat, VecTy, "zxt");
1706   return Ext;
1707 }
1708 
1709 auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
1710                                      const FxpOp &Op) const -> Value * {
1711   assert(Op.X.Val->getType() == Op.Y.Val->getType());
1712   auto *InpTy = cast<VectorType>(Op.X.Val->getType());
1713   unsigned Width = InpTy->getScalarSizeInBits();
1714   bool Rounding = Op.RoundAt.has_value();
1715 
1716   if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
1717     // The fixed-point intrinsics do signed multiplication.
1718     if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
1719       Value *QMul = nullptr;
1720       if (Width == 16) {
1721         QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
1722       } else if (Width == 32) {
1723         QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
1724       }
1725       if (QMul != nullptr)
1726         return QMul;
1727     }
1728   }
1729 
1730   assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
1731   assert(Width < 32 || Width % 32 == 0);       // Width > 32 => Width is 32*k
1732 
1733   // If Width < 32, then it should really be 16.
1734   if (Width < 32) {
1735     if (Width < 16)
1736       return nullptr;
1737     // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
1738     // generate a full-precision product, which is unnecessary if there is
1739     // no shift.
1740     assert(Width == 16);
1741     assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
1742     if (Op.Frac == 16) {
1743       // Multiply high
1744       if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
1745         return MulH;
1746     }
1747     // Do full-precision multiply and shift.
1748     Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
1749     if (Rounding) {
1750       Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);
1751       Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
1752     }
1753 
1754     Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);
1755     Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
1756                          ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
1757                          : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
1758     return Builder.CreateTrunc(Shifted, InpTy, "trn");
1759   }
1760 
1761   // Width >= 32
1762 
1763   // Break up the arguments Op.X and Op.Y into vectors of smaller widths
1764   // in preparation for doing the multiplication by 32-bit parts.
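       // For example, a vector of i64 elements becomes two vectors of i32
       // elements, lowest words first (see splitVectorElements).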
1765   auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
1766   auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
1767   auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
1768 
1769   auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
1770 
1771   // Add the optional rounding to the proper word.
1772   if (Op.RoundAt.has_value()) {
1773     Value *Zero = HVC.getNullValue(WordX[0]->getType());
1774     SmallVector<Value *> RoundV(WordP.size(), Zero);
1775     RoundV[*Op.RoundAt / 32] =
1776         HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32));
1777     WordP = createAddLong(Builder, WordP, RoundV);
1778   }
1779 
1780   // createRightShiftLong?
1781 
1782   // Shift all products right by Op.Frac.
1783   unsigned SkipWords = Op.Frac / 32;
1784   Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32);
1785 
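       // For example, with Op.Frac == 40: SkipWords == 1, ShiftAmt == 8, and
       // word Dst becomes fshr(WordP[Dst + 2], WordP[Dst + 1], 8), i.e. bits
       // [32*Dst + 40, 32*Dst + 72) of the unshifted product.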
1786   for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
1787     int Src = Dst + SkipWords;
1788     Value *Lo = WordP[Src];
1789     if (Src + 1 < End) {
1790       Value *Hi = WordP[Src + 1];
1791       WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
1792                                            {Hi, Lo, ShiftAmt},
1793                                            /*FMFSource*/ nullptr, "int");
1794     } else {
1795       // The shift of the most significant word.
1796       WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
1797     }
1798   }
1799   if (SkipWords != 0)
1800     WordP.resize(WordP.size() - SkipWords);
1801 
1802   return HVC.joinVectorElements(Builder, WordP, Op.ResTy);
1803 }
1804 
1805 auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
1806                              bool Rounding) const -> Value * {
1807   assert(X.Val->getType() == Y.Val->getType());
1808   assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
1809   assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
1810 
1811   // There is no non-rounding intrinsic for i16.
1812   if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
1813     return nullptr;
1814 
1815   auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
1816   return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
1817                                 {X.Val, Y.Val});
1818 }
1819 
1820 auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
1821                              bool Rounding) const -> Value * {
1822   Type *InpTy = X.Val->getType();
1823   assert(InpTy == Y.Val->getType());
1824   assert(InpTy->getScalarType() == HVC.getIntTy(32));
1825   assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
1826 
1827   if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
1828     return nullptr;
1829 
1830   auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
1831   auto V6_vmpyo_acc = Rounding
1832                           ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
1833                           : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
1834   Value *V1 =
1835       HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
1836   return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
1837                                 {V1, X.Val, Y.Val});
1838 }
1839 
1840 auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
1841                                Value *CarryIn) const
1842     -> std::pair<Value *, Value *> {
1843   assert(X->getType() == Y->getType());
1844   auto VecTy = cast<VectorType>(X->getType());
1845   if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
1846     SmallVector<Value *> Args = {X, Y};
1847     Intrinsic::ID AddCarry;
1848     if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
1849       AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
1850     } else {
1851       AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
1852       if (CarryIn == nullptr)
1853         CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
1854       Args.push_back(CarryIn);
1855     }
1856     Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
1857                                         /*RetTy=*/nullptr, Args);
1858     Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
1859     Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
1860     return {Result, CarryOut};
1861   }
1862 
1863   // In other cases, do a regular add, and unsigned compare-less-than.
1864   // The carry-out can originate in two places: adding the carry-in or adding
1865   // the two input values.
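       // (An unsigned addition wraps iff the truncated sum is less than either
       // operand, which is what the ICMP_ULT comparisons below check.)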
1866   Value *Result1 = X; // Result1 = X + CarryIn
1867   if (CarryIn != nullptr) {
1868     unsigned Width = VecTy->getScalarSizeInBits();
1869     uint32_t Mask = 1;
1870     if (Width < 32) {
1871       for (unsigned i = 0, e = 32 / Width; i != e; ++i)
1872         Mask = (Mask << Width) | 1;
1873     }
1874     auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
1875     Value *ValueIn =
1876         HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
1877                                {CarryIn, HVC.getConstInt(Mask)});
1878     Result1 = Builder.CreateAdd(X, ValueIn, "add");
1879   }
1880 
1881   Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
1882   Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
1883   Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
1884   return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
1885 }
1886 
1887 auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
1888     -> Value * {
1889   Intrinsic::ID V6_vmpyh = 0;
1890   std::tie(X, Y) = canonSgn(X, Y);
1891 
1892   if (X.Sgn == Signed) {
1893     V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
1894   } else if (Y.Sgn == Signed) {
1895     // In vmpyhus the second operand is unsigned
1896     V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
1897   } else {
1898     V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
1899   }
1900 
1901   // i16*i16 -> i32 / interleaved
1902   Value *P =
1903       HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
1904   // Deinterleave
1905   return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
1906 }
1907 
1908 auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
1909     -> Value * {
1910   Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
1911 
1912   if (HVC.HST.useHVXV69Ops()) {
1913     if (X.Sgn != Signed && Y.Sgn != Signed) {
1914       auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
1915       return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
1916                                     {X.Val, Y.Val});
1917     }
1918   }
1919 
1920   Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
1921   Value *Pair16 =
1922       Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
1923   unsigned Len = HVC.length(HvxP16Ty) / 2;
1924 
1925   SmallVector<int, 128> PickOdd(Len);
1926   for (int i = 0; i != static_cast<int>(Len); ++i)
1927     PickOdd[i] = 2 * i + 1;
1928 
1929   return Builder.CreateShuffleVector(
1930       HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
1931 }
1932 
1933 auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
1934     -> std::pair<Value *, Value *> {
1935   assert(X.Val->getType() == Y.Val->getType());
1936   assert(X.Val->getType() == HvxI32Ty);
1937 
1938   Intrinsic::ID V6_vmpy_parts;
1939   std::tie(X, Y) = canonSgn(X, Y);
1940 
1941   if (X.Sgn == Signed) {
1942     V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
1943   } else if (Y.Sgn == Signed) {
1944     V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
1945   } else {
1946     V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
1947   }
1948 
1949   Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
1950                                         {X.Val, Y.Val}, {HvxI32Ty});
1951   Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
1952   Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
1953   return {Lo, Hi};
1954 }
1955 
1956 auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
1957                               ArrayRef<Value *> WordY) const
1958     -> SmallVector<Value *> {
1959   assert(WordX.size() == WordY.size());
1960   unsigned Idx = 0, Length = WordX.size();
1961   SmallVector<Value *> Sum(Length);
1962 
1963   while (Idx != Length) {
1964     if (HVC.isZero(WordX[Idx]))
1965       Sum[Idx] = WordY[Idx];
1966     else if (HVC.isZero(WordY[Idx]))
1967       Sum[Idx] = WordX[Idx];
1968     else
1969       break;
1970     ++Idx;
1971   }
1972 
1973   Value *Carry = nullptr;
1974   for (; Idx != Length; ++Idx) {
1975     std::tie(Sum[Idx], Carry) =
1976         createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
1977   }
1978 
1979   // This drops the final carry beyond the highest word.
1980   return Sum;
1981 }
1982 
1983 auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
1984                               Signedness SgnX, ArrayRef<Value *> WordY,
1985                               Signedness SgnY) const -> SmallVector<Value *> {
1986   SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
1987 
1988   // WordX[i] * WordY[j] produces words i+j and i+j+1 of the result,
1989   // that is, halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
1990   for (int i = 0, e = WordX.size(); i != e; ++i) {
1991     for (int j = 0, f = WordY.size(); j != f; ++j) {
1992       // Check the 4 halves that this multiplication can generate.
1993       Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
1994       Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
1995       auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
1996       Products[i + j + 0].push_back(Lo);
1997       Products[i + j + 1].push_back(Hi);
1998     }
1999   }
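       // For example, with two words per input (X = x1:x0, Y = y1:y0):
       //   Products[0] = {lo(x0*y0)}
       //   Products[1] = {hi(x0*y0), lo(x0*y1), lo(x1*y0)}
       //   Products[2] = {hi(x0*y1), hi(x1*y0), lo(x1*y1)}
       //   Products[3] = {hi(x1*y1)}
       // Each column is then summed up with carry propagation below.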
2000 
2001   Value *Zero = HVC.getNullValue(WordX[0]->getType());
2002 
2003   auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
2004     if (Vector.empty())
2005       return Zero;
2006     auto Last = Vector.back();
2007     Vector.pop_back();
2008     return Last;
2009   };
2010 
2011   for (int i = 0, e = Products.size(); i != e; ++i) {
2012     while (Products[i].size() > 1) {
2013       Value *Carry = nullptr; // no carry-in
2014       for (int j = i; j != e; ++j) {
2015         auto &ProdJ = Products[j];
2016         auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
2017                                               pop_back_or_zero(ProdJ), Carry);
2018         ProdJ.insert(ProdJ.begin(), Sum);
2019         Carry = CarryOut;
2020       }
2021     }
2022   }
2023 
2024   SmallVector<Value *> WordP;
2025   for (auto &P : Products) {
2026     assert(P.size() == 1 && "Should have been added together");
2027     WordP.push_back(P.front());
2028   }
2029 
2030   return WordP;
2031 }
2032 
2033 auto HvxIdioms::run() -> bool {
2034   bool Changed = false;
2035 
2036   for (BasicBlock &B : HVC.F) {
2037     for (auto It = B.rbegin(); It != B.rend(); ++It) {
2038       if (auto Fxm = matchFxpMul(*It)) {
2039         Value *New = processFxpMul(*It, *Fxm);
2040         // Always report "changed" for now.
2041         Changed = true;
2042         if (!New)
2043           continue;
2044         bool StartOver = !isa<Instruction>(New);
2045         It->replaceAllUsesWith(New);
2046         RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
2047         It = StartOver ? B.rbegin()
2048                        : cast<Instruction>(New)->getReverseIterator();
2049         Changed = true;
2050       }
2051     }
2052   }
2053 
2054   return Changed;
2055 }
2056 
2057 // --- End HvxIdioms
2058 
2059 auto HexagonVectorCombine::run() -> bool {
2060   if (!HST.useHVXOps())
2061     return false;
2062 
2063   bool Changed = false;
2064   Changed |= AlignVectors(*this).run();
2065   Changed |= HvxIdioms(*this).run();
2066 
2067   return Changed;
2068 }
2069 
2070 auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
2071   return IntegerType::get(F.getContext(), Width);
2072 }
2073 
2074 auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
2075   assert(ElemCount >= 0);
2076   IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
2077   if (ElemCount == 0)
2078     return ByteTy;
2079   return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
2080 }
2081 
2082 auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
2083   assert(ElemCount >= 0);
2084   IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
2085   if (ElemCount == 0)
2086     return BoolTy;
2087   return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
2088 }
2089 
2090 auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
2091     -> ConstantInt * {
2092   return ConstantInt::getSigned(getIntTy(Width), Val);
2093 }
2094 
2095 auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
2096   if (auto *C = dyn_cast<Constant>(Val))
2097     return C->isZeroValue();
2098   return false;
2099 }
2100 
2101 auto HexagonVectorCombine::getIntValue(const Value *Val) const
2102     -> std::optional<APInt> {
2103   if (auto *CI = dyn_cast<ConstantInt>(Val))
2104     return CI->getValue();
2105   return std::nullopt;
2106 }
2107 
2108 auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
2109   return isa<UndefValue>(Val);
2110 }
2111 
2112 auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
2113   return Val == ConstantInt::getTrue(Val->getType());
2114 }
2115 
2116 auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
2117   return isZero(Val);
2118 }
2119 
2120 auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
2121     -> VectorType * {
2122   EVT ETy = EVT::getEVT(ElemTy, false);
2123   assert(ETy.isSimple() && "Invalid HVX element type");
2124   // Do not allow boolean types here: they don't have a fixed length.
2125   assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
2126          "Invalid HVX element type");
2127   unsigned HwLen = HST.getVectorLength();
2128   unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
2129   return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
2130                          /*Scalable=*/false);
2131 }
2132 
2133 auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
2134     -> int {
2135   return getSizeOf(Val->getType(), Kind);
2136 }
2137 
2138 auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
2139     -> int {
2140   auto *NcTy = const_cast<Type *>(Ty);
2141   switch (Kind) {
2142   case Store:
2143     return DL.getTypeStoreSize(NcTy).getFixedValue();
2144   case Alloc:
2145     return DL.getTypeAllocSize(NcTy).getFixedValue();
2146   }
2147   llvm_unreachable("Unhandled SizeKind enum");
2148 }
2149 
2150 auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
2151   // The actual type may be shorter than the HVX vector, so determine
2152   // the alignment based on subtarget info.
2153   if (HST.isTypeForHVX(Ty))
2154     return HST.getVectorLength();
2155   return DL.getABITypeAlign(Ty).value();
2156 }
2157 
2158 auto HexagonVectorCombine::length(Value *Val) const -> size_t {
2159   return length(Val->getType());
2160 }
2161 
2162 auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
2163   auto *VecTy = dyn_cast<VectorType>(Ty);
2164   assert(VecTy && "Must be a vector type");
2165   return VecTy->getElementCount().getFixedValue();
2166 }
2167 
2168 auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {
2169   assert(Ty->isIntOrIntVectorTy());
2170   auto Zero = ConstantInt::get(Ty->getScalarType(), 0);
2171   if (auto *VecTy = dyn_cast<VectorType>(Ty))
2172     return ConstantVector::getSplat(VecTy->getElementCount(), Zero);
2173   return Zero;
2174 }
2175 
2176 auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {
2177   assert(Ty->isIntOrIntVectorTy());
2178   auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);
2179   if (auto *VecTy = dyn_cast<VectorType>(Ty))
2180     return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);
2181   return Minus1;
2182 }
2183 
2184 auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const
2185     -> Constant * {
2186   assert(Ty->isVectorTy());
2187   auto VecTy = cast<VectorType>(Ty);
2188   Type *ElemTy = VecTy->getElementType();
2189   // Add support for floats if needed.
2190   auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(),
2191                                          ConstantInt::get(ElemTy, Val));
2192   return Splat;
2193 }
2194 
2195 auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
2196   if (auto *In = dyn_cast<Instruction>(V)) {
2197     SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
2198     return simplifyInstruction(In, Q);
2199   }
2200   return nullptr;
2201 }
2202 
2203 // Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
2204 auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
2205                                    Value *Src, int Start, int Length,
2206                                    int Where) const -> Value * {
2207   assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
2208   int SrcLen = getSizeOf(Src);
2209   int DstLen = getSizeOf(Dst);
2210   assert(0 <= Start && Start + Length <= SrcLen);
2211   assert(0 <= Where && Where + Length <= DstLen);
2212 
2213   int P2Len = PowerOf2Ceil(SrcLen | DstLen);
2214   auto *Undef = UndefValue::get(getByteTy());
2215   Value *P2Src = vresize(Builder, Src, P2Len, Undef);
2216   Value *P2Dst = vresize(Builder, Dst, P2Len, Undef);
2217 
2218   SmallVector<int, 256> SMask(P2Len);
2219   for (int i = 0; i != P2Len; ++i) {
2220     // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
2221     // Otherwise, pick Dst[i].
2222     SMask[i] =
2223         (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
2224   }
2225 
2226   Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
2227   return vresize(Builder, P2Insert, DstLen, Undef);
2228 }
2229 
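     // vlalignb(Lo, Hi, Amt): return the top Amt bytes of Lo followed by the
     // low VecLen-Amt bytes of Hi, i.e. VecLen consecutive bytes taken from
     // the concatenation Lo:Hi starting at byte VecLen-Amt.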
2230 auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
2231                                     Value *Hi, Value *Amt) const -> Value * {
2232   assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
2233   if (isZero(Amt))
2234     return Hi;
2235   int VecLen = getSizeOf(Hi);
2236   if (auto IntAmt = getIntValue(Amt))
2237     return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
2238                            VecLen);
2239 
2240   if (HST.isTypeForHVX(Hi->getType())) {
2241     assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
2242            "Expecting an exact HVX type");
2243     return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
2244                               Hi->getType(), {Hi, Lo, Amt});
2245   }
2246 
2247   if (VecLen == 4) {
2248     Value *Pair = concat(Builder, {Lo, Hi});
2249     Value *Shift =
2250         Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
2251     Value *Trunc =
2252         Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
2253     return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
2254   }
2255   if (VecLen == 8) {
2256     Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
2257     return vralignb(Builder, Lo, Hi, Sub);
2258   }
2259   llvm_unreachable("Unexpected vector length");
2260 }
2261 
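     // vralignb(Lo, Hi, Amt): return the high VecLen-Amt bytes of Lo followed
     // by the low Amt bytes of Hi, i.e. VecLen consecutive bytes taken from
     // the concatenation Lo:Hi starting at byte Amt.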
2262 auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
2263                                     Value *Hi, Value *Amt) const -> Value * {
2264   assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
2265   if (isZero(Amt))
2266     return Lo;
2267   int VecLen = getSizeOf(Lo);
2268   if (auto IntAmt = getIntValue(Amt))
2269     return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
2270 
2271   if (HST.isTypeForHVX(Lo->getType())) {
2272     assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
2273            "Expecting an exact HVX type");
2274     return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
2275                               Lo->getType(), {Hi, Lo, Amt});
2276   }
2277 
2278   if (VecLen == 4) {
2279     Value *Pair = concat(Builder, {Lo, Hi});
2280     Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
2281     Value *Trunc =
2282         Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
2283     return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
2284   }
2285   if (VecLen == 8) {
2286     Type *Int64Ty = Type::getInt64Ty(F.getContext());
2287     Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
2288     Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
2289     Function *FI = Intrinsic::getDeclaration(F.getParent(),
2290                                              Intrinsic::hexagon_S2_valignrb);
2291     Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup");
2292     return Builder.CreateBitCast(Call, Lo->getType(), "cst");
2293   }
2294   llvm_unreachable("Unexpected vector length");
2295 }
2296 
2297 // Concatenates a sequence of vectors of the same type.
2298 auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
2299                                   ArrayRef<Value *> Vecs) const -> Value * {
2300   assert(!Vecs.empty());
2301   SmallVector<int, 256> SMask;
2302   std::vector<Value *> Work[2];
2303   int ThisW = 0, OtherW = 1;
2304 
2305   Work[ThisW].assign(Vecs.begin(), Vecs.end());
2306   while (Work[ThisW].size() > 1) {
2307     auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
2308     SMask.resize(length(Ty) * 2);
2309     std::iota(SMask.begin(), SMask.end(), 0);
2310 
2311     Work[OtherW].clear();
2312     if (Work[ThisW].size() % 2 != 0)
2313       Work[ThisW].push_back(UndefValue::get(Ty));
2314     for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
2315       Value *Joined = Builder.CreateShuffleVector(
2316           Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
2317       Work[OtherW].push_back(Joined);
2318     }
2319     std::swap(ThisW, OtherW);
2320   }
2321 
2322   // Since undefs may have been appended (to pair up an odd number of
2323   // vectors), the result may be wider than needed; perform a final
2324   // shuffle to pick only the original elements.
2325   SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
2326   std::iota(SMask.begin(), SMask.end(), 0);
2327   Value *Total = Work[ThisW].front();
2328   return Builder.CreateShuffleVector(Total, SMask, "shf");
2329 }
2330 
2331 auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
2332                                    int NewSize, Value *Pad) const -> Value * {
2333   assert(isa<VectorType>(Val->getType()));
2334   auto *ValTy = cast<VectorType>(Val->getType());
2335   assert(ValTy->getElementType() == Pad->getType());
2336 
2337   int CurSize = length(ValTy);
2338   if (CurSize == NewSize)
2339     return Val;
2340   // Truncate?
2341   if (CurSize > NewSize)
2342     return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
2343   // Extend.
2344   SmallVector<int, 128> SMask(NewSize);
2345   std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
2346   std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
2347   Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
2348   return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
2349 }
2350 
2351 auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
2352                                    Type *FromTy, Type *ToTy) const -> Value * {
2353   // Mask is a vector <N x i1>, where each element corresponds to an
2354   // element of FromTy. Remap it so that each element will correspond
2355   // to an element of ToTy.
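       // For example, a <16 x i1> mask created for <16 x i32> and rescaled to
       // i8 elements becomes a <64 x i1> mask with each bit repeated 4 times.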
2356   assert(isa<VectorType>(Mask->getType()));
2357 
2358   Type *FromSTy = FromTy->getScalarType();
2359   Type *ToSTy = ToTy->getScalarType();
2360   if (FromSTy == ToSTy)
2361     return Mask;
2362 
2363   int FromSize = getSizeOf(FromSTy);
2364   int ToSize = getSizeOf(ToSTy);
2365   assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
2366 
2367   auto *MaskTy = cast<VectorType>(Mask->getType());
2368   int FromCount = length(MaskTy);
2369   int ToCount = (FromCount * FromSize) / ToSize;
2370   assert((FromCount * FromSize) % ToSize == 0);
2371 
2372   auto *FromITy = getIntTy(FromSize * 8);
2373   auto *ToITy = getIntTy(ToSize * 8);
2374 
2375   // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
2376   // -> trunc to <M x i1>.
2377   Value *Ext = Builder.CreateSExt(
2378       Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
2379   Value *Cast = Builder.CreateBitCast(
2380       Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
2381   return Builder.CreateTrunc(
2382       Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
2383 }
2384 
2385 // Bitcast to bytes, and return least significant bits.
2386 auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
2387     -> Value * {
2388   Type *ScalarTy = Val->getType()->getScalarType();
2389   if (ScalarTy == getBoolTy())
2390     return Val;
2391 
2392   Value *Bytes = vbytes(Builder, Val);
2393   if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
2394     return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
2395   // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
2396   // <1 x i1>.
2397   return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
2398 }
2399 
2400 // Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
2401 auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
2402     -> Value * {
2403   Type *ScalarTy = Val->getType()->getScalarType();
2404   if (ScalarTy == getByteTy())
2405     return Val;
2406 
2407   if (ScalarTy != getBoolTy())
2408     return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
2409   // For bool, return a sext from i1 to i8.
2410   if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
2411     return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
2412   return Builder.CreateSExt(Val, getByteTy(), "sxt");
2413 }
2414 
2415 auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
2416                                      unsigned Start, unsigned Length) const
2417     -> Value * {
2418   assert(Start + Length <= length(Val));
2419   return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
2420 }
2421 
2422 auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
2423     -> Value * {
2424   size_t Len = length(Val);
2425   assert(Len % 2 == 0 && "Length should be even");
2426   return subvector(Builder, Val, 0, Len / 2);
2427 }
2428 
2429 auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
2430     -> Value * {
2431   size_t Len = length(Val);
2432   assert(Len % 2 == 0 && "Length should be even");
2433   return subvector(Builder, Val, Len / 2, Len / 2);
2434 }
2435 
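     // Deinterleave: gather the even-indexed elements of Val0:Val1 into the
     // low half of the result and the odd-indexed elements into the high half
     // (complementary to vshuff below).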
2436 auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
2437                                  Value *Val1) const -> Value * {
2438   assert(Val0->getType() == Val1->getType());
2439   int Len = length(Val0);
2440   SmallVector<int, 128> Mask(2 * Len);
2441 
2442   for (int i = 0; i != Len; ++i) {
2443     Mask[i] = 2 * i;           // Even
2444     Mask[i + Len] = 2 * i + 1; // Odd
2445   }
2446   return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
2447 }
2448 
2449 auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
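     // Interleave: produce [Val0[0], Val1[0], Val0[1], Val1[1], ...].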
2450                                   Value *Val1) const -> Value * { //
2451   assert(Val0->getType() == Val1->getType());
2452   int Len = length(Val0);
2453   SmallVector<int, 128> Mask(2 * Len);
2454 
2455   for (int i = 0; i != Len; ++i) {
2456     Mask[2 * i + 0] = i;       // Val0
2457     Mask[2 * i + 1] = i + Len; // Val1
2458   }
2459   return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
2460 }
2461 
2462 auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
2463                                               Intrinsic::ID IntID, Type *RetTy,
2464                                               ArrayRef<Value *> Args,
2465                                               ArrayRef<Type *> ArgTys,
2466                                               ArrayRef<Value *> MDSources) const
2467     -> Value * {
2468   auto getCast = [&](IRBuilderBase &Builder, Value *Val,
2469                      Type *DestTy) -> Value * {
2470     Type *SrcTy = Val->getType();
2471     if (SrcTy == DestTy)
2472       return Val;
2473 
2474     // A non-HVX type would be a scalar that already has a valid (matching)
2475     // type and is handled by the early return above; SrcTy must be HVX here.
2476     assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
2477 
2478     Type *BoolTy = Type::getInt1Ty(F.getContext());
2479     if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
2480       return Builder.CreateBitCast(Val, DestTy, "cst");
2481 
2482     // Predicate HVX vector.
2483     unsigned HwLen = HST.getVectorLength();
2484     Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
2485                                    : Intrinsic::hexagon_V6_pred_typecast_128B;
2486     Function *FI =
2487         Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()});
2488     return Builder.CreateCall(FI, {Val}, "cup");
2489   };
2490 
2491   Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys);
2492   FunctionType *IntrTy = IntrFn->getFunctionType();
2493 
2494   SmallVector<Value *, 4> IntrArgs;
2495   for (int i = 0, e = Args.size(); i != e; ++i) {
2496     Value *A = Args[i];
2497     Type *T = IntrTy->getParamType(i);
2498     if (A->getType() != T) {
2499       IntrArgs.push_back(getCast(Builder, A, T));
2500     } else {
2501       IntrArgs.push_back(A);
2502     }
2503   }
2504   StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
2505   CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);
2506 
2507   MemoryEffects ME = Call->getAttributes().getMemoryEffects();
2508   if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
2509     propagateMetadata(Call, MDSources);
2510 
2511   Type *CallTy = Call->getType();
2512   if (RetTy == nullptr || CallTy == RetTy)
2513     return Call;
2514   // Scalar types should have RetTy matching the call return type.
2515   assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
2516   return getCast(Builder, Call, RetTy);
2517 }
2518 
2519 auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
2520                                                Value *Vec,
2521                                                unsigned ToWidth) const
2522     -> SmallVector<Value *> {
2523   // Break a vector of wide elements into a series of vectors with narrow
2524   // elements:
2525   //   (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
2526   // -->
2527   //   (a0, a1, a2, ...)    // lowest "ToWidth" bits
2528   //   (b0, b1, b2, ...)    // the next lowest...
2529   //   (c0, c1, c2, ...)    // ...
2530   //   ...
2531   //
2532   // The number of elements in each resulting vector is the same as
2533   // in the original vector.
2534 
2535   auto *VecTy = cast<VectorType>(Vec->getType());
2536   assert(VecTy->getElementType()->isIntegerTy());
2537   unsigned FromWidth = VecTy->getScalarSizeInBits();
2538   assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
2539   assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
2540   unsigned NumResults = FromWidth / ToWidth;
2541 
2542   SmallVector<Value *> Results(NumResults);
2543   Results[0] = Vec;
2544   unsigned Length = length(VecTy);
2545 
2546   // Do it by splitting in half, since those operations correspond to deal
2547   // instructions.
2548   auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
2549     // Take V = Results[Begin], split it into L and H.
2550     // Store Results[Begin] = L, Results[(Begin+End)/2] = H.
2551     // Call itself recursively: split(Begin, Half), split(Half, End).
2552     if (Begin + 1 == End)
2553       return;
2554 
2555     Value *Val = Results[Begin];
2556     unsigned Width = Val->getType()->getScalarSizeInBits();
2557 
2558     auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
2559     Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");
2560 
2561     Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
2562 
2563     unsigned Half = (Begin + End) / 2;
2564     Results[Begin] = sublo(Builder, Res);
2565     Results[Half] = subhi(Builder, Res);
2566 
2567     splitFunc(Begin, Half, splitFunc);
2568     splitFunc(Half, End, splitFunc);
2569   };
2570 
2571   splitInHalf(0, NumResults, splitInHalf);
2572   return Results;
2573 }
2574 
2575 auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
2576                                               ArrayRef<Value *> Values,
2577                                               VectorType *ToType) const
2578     -> Value * {
2579   assert(ToType->getElementType()->isIntegerTy());
2580 
2581   // If the list of values does not have power-of-2 elements, append copies
2582   // of the sign bit to it, to make the size be 2^n.
2583   // The reason for this is that the values will be joined in pairs, because
2584   // otherwise the shuffles will result in convoluted code. With pairwise
2585   // joins, the shuffles will hopefully be folded into a perfect shuffle.
2586   // The output will need to be sign-extended to a type with element width
2587   // being a power-of-2 anyway.
2588   SmallVector<Value *> Inputs(Values.begin(), Values.end());
2589 
2590   unsigned ToWidth = ToType->getScalarSizeInBits();
2591   unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
2592   assert(Width <= ToWidth);
2593   assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
2594   unsigned Length = length(Inputs.front()->getType());
2595 
2596   unsigned NeedInputs = ToWidth / Width;
2597   if (Inputs.size() != NeedInputs) {
2598     // Having too many inputs is ok: drop the high bits (usual wrap-around).
2599     // If there are too few, fill them with the sign bit.
2600     Value *Last = Inputs.back();
2601     Value *Sign = Builder.CreateAShr(
2602         Last, getConstSplat(Last->getType(), Width - 1), "asr");
2603     Inputs.resize(NeedInputs, Sign);
2604   }
2605 
2606   while (Inputs.size() > 1) {
2607     Width *= 2;
2608     auto *VTy = VectorType::get(getIntTy(Width), Length, false);
2609     for (int i = 0, e = Inputs.size(); i < e; i += 2) {
2610       Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
2611       Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
2612     }
2613     Inputs.resize(Inputs.size() / 2);
2614   }
2615 
2616   assert(Inputs.front()->getType() == ToType);
2617   return Inputs.front();
2618 }
2619 
2620 auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
2621                                                       Value *Ptr1) const
2622     -> std::optional<int> {
2623   struct Builder : IRBuilder<> {
2624     Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
2625     ~Builder() {
2626       for (Instruction *I : llvm::reverse(ToErase))
2627         I->eraseFromParent();
2628     }
2629     SmallVector<Instruction *, 8> ToErase;
2630   };
2631 
2632 #define CallBuilder(B, F)                                                      \
2633   [&](auto &B_) {                                                              \
2634     Value *V = B_.F;                                                           \
2635     if (auto *I = dyn_cast<Instruction>(V))                                    \
2636       B_.ToErase.push_back(I);                                                 \
2637     return V;                                                                  \
2638   }(B)
2639 
2640   auto Simplify = [this](Value *V) {
2641     if (Value *S = simplify(V))
2642       return S;
2643     return V;
2644   };
2645 
2646   auto StripBitCast = [](Value *V) {
2647     while (auto *C = dyn_cast<BitCastInst>(V))
2648       V = C->getOperand(0);
2649     return V;
2650   };
2651 
2652   Ptr0 = StripBitCast(Ptr0);
2653   Ptr1 = StripBitCast(Ptr1);
2654   if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
2655     return std::nullopt;
2656 
2657   auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
2658   auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
2659   if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
2660     return std::nullopt;
2661   if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
2662     return std::nullopt;
2663 
2664   Builder B(Gep0->getParent());
2665   int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
2666 
2667   // FIXME: for now only check GEPs with a single index.
2668   if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
2669     return std::nullopt;
2670 
2671   Value *Idx0 = Gep0->getOperand(1);
2672   Value *Idx1 = Gep1->getOperand(1);
2673 
2674   // First, try to simplify the subtraction directly.
2675   if (auto *Diff = dyn_cast<ConstantInt>(
2676           Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
2677     return Diff->getSExtValue() * Scale;
2678 
2679   KnownBits Known0 = getKnownBits(Idx0, Gep0);
2680   KnownBits Known1 = getKnownBits(Idx1, Gep1);
2681   APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
2682   if (Unknown.isAllOnes())
2683     return std::nullopt;
2684 
2685   Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
2686   Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
2687   Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
2688   Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
2689   int Diff0 = 0;
2690   if (auto *C = dyn_cast<ConstantInt>(SubU)) {
2691     Diff0 = C->getSExtValue();
2692   } else {
2693     return std::nullopt;
2694   }
2695 
2696   Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
2697   Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
2698   Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
2699   Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
2700   int Diff1 = 0;
2701   if (auto *C = dyn_cast<ConstantInt>(SubK)) {
2702     Diff1 = C->getSExtValue();
2703   } else {
2704     return std::nullopt;
2705   }
2706 
2707   return (Diff0 + Diff1) * Scale;
2708 
2709 #undef CallBuilder
2710 }
2711 
2712 auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
2713                                                  const Instruction *CtxI) const
2714     -> unsigned {
2715   return ComputeMaxSignificantBits(V, DL, /*Depth=*/0, &AC, CtxI, &DT);
2716 }
2717 
2718 auto HexagonVectorCombine::getKnownBits(const Value *V,
2719                                         const Instruction *CtxI) const
2720     -> KnownBits {
2721   return computeKnownBits(V, DL, /*Depth=*/0, &AC, CtxI, &DT, /*ORE=*/nullptr,
2722                           /*UseInstrInfo=*/true);
2723 }
2724 
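     // Check whether In can be moved to the position To within its basic block
     // without violating memory, exception, or synchronization dependencies.
     // Def-use dependencies are not checked here, and instructions listed in
     // IgnoreInsts are skipped during the scan.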
2725 template <typename T>
2726 auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
2727                                                   BasicBlock::const_iterator To,
2728                                                   const T &IgnoreInsts) const
2729     -> bool {
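       // MemoryLocation::getOrNone does not understand masked loads/stores, so
       // for those take the location from the intrinsic's pointer argument.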
2730   auto getLocOrNone =
2731       [this](const Instruction &I) -> std::optional<MemoryLocation> {
2732     if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
2733       switch (II->getIntrinsicID()) {
2734       case Intrinsic::masked_load:
2735         return MemoryLocation::getForArgument(II, 0, TLI);
2736       case Intrinsic::masked_store:
2737         return MemoryLocation::getForArgument(II, 1, TLI);
2738       }
2739     }
2740     return MemoryLocation::getOrNone(&I);
2741   };
2742 
2743   // The source and the destination must be in the same basic block.
2744   const BasicBlock &Block = *In.getParent();
2745   assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
2746   // No PHIs.
2747   if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
2748     return false;
2749 
2750   if (!mayHaveNonDefUseDependency(In))
2751     return true;
2752   bool MayWrite = In.mayWriteToMemory();
2753   auto MaybeLoc = getLocOrNone(In);
2754 
2755   auto From = In.getIterator();
2756   if (From == To)
2757     return true;
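       // Scan the instructions that In would be moved across: from To down to
       // In when moving up, or from just past In up to To when moving down.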
2758   bool MoveUp = (To != Block.end() && To->comesBefore(&In));
2759   auto Range =
2760       MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
2761   for (auto It = Range.first; It != Range.second; ++It) {
2762     const Instruction &I = *It;
2763     if (llvm::is_contained(IgnoreInsts, &I))
2764       continue;
2765     // The llvm.assume intrinsic can be ignored.
2766     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
2767       if (II->getIntrinsicID() == Intrinsic::assume)
2768         continue;
2769     }
2770     // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
2771     if (I.mayThrow())
2772       return false;
2773     if (auto *CB = dyn_cast<CallBase>(&I)) {
2774       if (!CB->hasFnAttr(Attribute::WillReturn))
2775         return false;
2776       if (!CB->hasFnAttr(Attribute::NoSync))
2777         return false;
2778     }
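         // If either of the two instructions may write to memory, both must
         // have a precise location and the locations must not alias.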
2779     if (I.mayReadOrWriteMemory()) {
2780       auto MaybeLocI = getLocOrNone(I);
2781       if (MayWrite || I.mayWriteToMemory()) {
2782         if (!MaybeLoc || !MaybeLocI)
2783           return false;
2784         if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
2785           return false;
2786       }
2787     }
2788   }
2789   return true;
2790 }
2791 
2792 auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
2793   if (auto *VecTy = dyn_cast<VectorType>(Ty))
2794     return VecTy->getElementType() == getByteTy();
2795   return false;
2796 }
2797 
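     // Return Length consecutive elements starting at index Start, taken from
     // the concatenation of Lo and Hi, as a single shuffle.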
2798 auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
2799                                            Value *Hi, int Start,
2800                                            int Length) const -> Value * {
2801   assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
2802   SmallVector<int, 128> SMask(Length);
2803   std::iota(SMask.begin(), SMask.end(), Start);
2804   return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
2805 }
2806 
2807 // Pass management.
2808 
2809 namespace llvm {
2810 void initializeHexagonVectorCombineLegacyPass(PassRegistry &);
2811 FunctionPass *createHexagonVectorCombineLegacyPass();
2812 } // namespace llvm
2813 
2814 namespace {
2815 class HexagonVectorCombineLegacy : public FunctionPass {
2816 public:
2817   static char ID;
2818 
2819   HexagonVectorCombineLegacy() : FunctionPass(ID) {}
2820 
2821   StringRef getPassName() const override { return "Hexagon Vector Combine"; }
2822 
2823   void getAnalysisUsage(AnalysisUsage &AU) const override {
2824     AU.setPreservesCFG();
2825     AU.addRequired<AAResultsWrapperPass>();
2826     AU.addRequired<AssumptionCacheTracker>();
2827     AU.addRequired<DominatorTreeWrapperPass>();
2828     AU.addRequired<TargetLibraryInfoWrapperPass>();
2829     AU.addRequired<TargetPassConfig>();
2830     FunctionPass::getAnalysisUsage(AU);
2831   }
2832 
2833   bool runOnFunction(Function &F) override {
2834     if (skipFunction(F))
2835       return false;
2836     AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
2837     AssumptionCache &AC =
2838         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2839     DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2840     TargetLibraryInfo &TLI =
2841         getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2842     auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
2843     HexagonVectorCombine HVC(F, AA, AC, DT, TLI, TM);
2844     return HVC.run();
2845   }
2846 };
2847 } // namespace
2848 
2849 char HexagonVectorCombineLegacy::ID = 0;
2850 
2851 INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
2852                       "Hexagon Vector Combine", false, false)
2853 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
2854 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2855 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2856 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2857 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
2858 INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
2859                     "Hexagon Vector Combine", false, false)
2860 
2861 FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
2862   return new HexagonVectorCombineLegacy();
2863 }
2864