//===-- HexagonVectorCombine.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// HexagonVectorCombine is a utility class implementing a variety of functions
// that assist in vector-based optimizations.
//
// AlignVectors: replace unaligned vector loads and stores with aligned ones.
// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"

#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"

#include <algorithm>
#include <deque>
#include <map>
#include <optional>
#include <set>
#include <utility>
#include <vector>

#define DEBUG_TYPE "hexagon-vc"

using namespace llvm;

namespace {
class HexagonVectorCombine {
public:
  HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
                       DominatorTree &DT_, TargetLibraryInfo &TLI_,
                       const TargetMachine &TM_)
      : F(F_), DL(F.getParent()->getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
        TLI(TLI_),
        HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}

  bool run();

  // Common integer type.
  IntegerType *getIntTy(unsigned Width = 32) const;
  // Byte type: either scalar (when ElemCount = 0), or vector with the
  // given element count.
  Type *getByteTy(int ElemCount = 0) const;
  // Boolean type: either scalar (when ElemCount = 0), or vector with the
  // given element count.
  Type *getBoolTy(int ElemCount = 0) const;
  // Create a ConstantInt of type returned by getIntTy with the value Val.
  ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
  // Get the integer value of Val, if it exists.
  std::optional<APInt> getIntValue(const Value *Val) const;
  // Is Val a constant 0, or a vector of 0s?
  bool isZero(const Value *Val) const;
  // Is Val an undef value?
  bool isUndef(const Value *Val) const;

  // Get HVX vector type with the given element type.
  VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;

  enum SizeKind {
    Store, // Store size
    Alloc, // Alloc size
  };
  int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
  int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
  int getTypeAlignment(Type *Ty) const;
  size_t length(Value *Val) const;
  size_t length(Type *Ty) const;

  Constant *getNullValue(Type *Ty) const;
  Constant *getFullValue(Type *Ty) const;
  Constant *getConstSplat(Type *Ty, int Val) const;

  Value *simplify(Value *Val) const;

  Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
                 int Length, int Where) const;
  Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
                  Value *Amt) const;
  Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
                  Value *Amt) const;
  Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
  Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
                 Value *Pad) const;
  Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
                 Type *ToTy) const;
  Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
  Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
  Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
                   unsigned Length) const;
  Value *sublo(IRBuilderBase &Builder, Value *Val) const;
  Value *subhi(IRBuilderBase &Builder, Value *Val) const;
  Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
  Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;

  Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
                            Type *RetTy, ArrayRef<Value *> Args,
                            ArrayRef<Type *> ArgTys = None) const;
  SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
                                           unsigned ToWidth) const;
  Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
                            VectorType *ToType) const;

  std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;

  unsigned getNumSignificantBits(const Value *V,
                                 const Instruction *CtxI = nullptr) const;
  KnownBits getKnownBits(const Value *V,
                         const Instruction *CtxI = nullptr) const;

  template <typename T = std::vector<Instruction *>>
  bool isSafeToMoveBeforeInBB(const Instruction &In,
                              BasicBlock::const_iterator To,
                              const T &IgnoreInsts = {}) const;

  // This function is only used for assertions at the moment.
  [[maybe_unused]] bool isByteVecTy(Type *Ty) const;

  Function &F;
  const DataLayout &DL;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  TargetLibraryInfo &TLI;
  const HexagonSubtarget &HST;

private:
  Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
                         int Start, int Length) const;
};

class AlignVectors {
public:
  AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}

  bool run();

private:
  using InstList = std::vector<Instruction *>;

  struct Segment {
    void *Data;
    int Start;
    int Size;
  };

  struct AddrInfo {
    AddrInfo(const AddrInfo &) = default;
    AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
             Align H)
        : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
          NeedAlign(HVC.getTypeAlignment(ValTy)) {}
    AddrInfo &operator=(const AddrInfo &) = default;

    // XXX: add Size member?
    Instruction *Inst;
    Value *Addr;
    Type *ValTy;
    Align HaveAlign;
    Align NeedAlign;
    int Offset = 0; // Offset (in bytes) from the first member of the
                    // containing AddrList.
  };
  using AddrList = std::vector<AddrInfo>;

  struct InstrLess {
    bool operator()(const Instruction *A, const Instruction *B) const {
      return A->comesBefore(B);
    }
  };
  using DepList = std::set<Instruction *, InstrLess>;

  struct MoveGroup {
    MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
        : Base(B), Main{AI.Inst}, IsHvx(Hvx), IsLoad(Load) {}
    Instruction *Base; // Base instruction of the parent address group.
    InstList Main;     // Main group of instructions.
    InstList Deps;     // List of dependencies.
    bool IsHvx;        // Is this a group of HVX instructions?
    bool IsLoad;       // Is this a load group?
  };
  using MoveList = std::vector<MoveGroup>;

  struct ByteSpan {
    struct Segment {
      // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
      Segment(Value *Val, int Begin, int Len)
          : Val(Val), Start(Begin), Size(Len) {}
      Segment(const Segment &Seg) = default;
      Segment &operator=(const Segment &Seg) = default;
      Value *Val; // Value representable as a sequence of bytes.
      int Start;  // First byte of the value that belongs to the segment.
      int Size;   // Number of bytes in the segment.
    };

    struct Block {
      Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
      Block(Value *Val, int Off, int Len, int Pos)
          : Seg(Val, Off, Len), Pos(Pos) {}
      Block(const Block &Blk) = default;
      Block &operator=(const Block &Blk) = default;
      Segment Seg; // Value segment.
      int Pos;     // Position (offset) of the segment in the Block.
    };

    int extent() const;
    ByteSpan section(int Start, int Length) const;
    ByteSpan &shift(int Offset);
    SmallVector<Value *, 8> values() const;

    int size() const { return Blocks.size(); }
    Block &operator[](int i) { return Blocks[i]; }

    std::vector<Block> Blocks;

    using iterator = decltype(Blocks)::iterator;
    iterator begin() { return Blocks.begin(); }
    iterator end() { return Blocks.end(); }
    using const_iterator = decltype(Blocks)::const_iterator;
    const_iterator begin() const { return Blocks.begin(); }
    const_iterator end() const { return Blocks.end(); }
  };

  Align getAlignFromValue(const Value *V) const;
  std::optional<MemoryLocation> getLocation(const Instruction &In) const;
  std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
  bool isHvx(const AddrInfo &AI) const;
  // This function is only used for assertions at the moment.
  [[maybe_unused]] bool isSectorTy(Type *Ty) const;

  Value *getPayload(Value *Val) const;
  Value *getMask(Value *Val) const;
  Value *getPassThrough(Value *Val) const;

  Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
                               int Adjust) const;
  Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
                              int Alignment) const;
  Value *createAlignedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
                           int Alignment, Value *Mask, Value *PassThru) const;
  Value *createAlignedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
                            int Alignment, Value *Mask) const;

  DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
  bool createAddressGroups();
  MoveList createLoadGroups(const AddrList &Group) const;
  MoveList createStoreGroups(const AddrList &Group) const;
  bool move(const MoveGroup &Move) const;
  void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
                        int ScLen, Value *AlignVal, Value *AlignAddr) const;
  void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
                         int ScLen, Value *AlignVal, Value *AlignAddr) const;
  bool realignGroup(const MoveGroup &Move) const;

  friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
  friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);

  std::map<Instruction *, AddrList> AddrGroups;
  const HexagonVectorCombine &HVC;
};

LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
  OS << "Inst: " << AI.Inst << "  " << *AI.Inst << '\n';
  OS << "Addr: " << *AI.Addr << '\n';
  OS << "Type: " << *AI.ValTy << '\n';
  OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
  OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
  OS << "Offset: " << AI.Offset;
  return OS;
}

LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
  OS << "Main\n";
  for (Instruction *I : MG.Main)
    OS << "  " << *I << '\n';
  OS << "Deps\n";
  for (Instruction *I : MG.Deps)
    OS << "  " << *I << '\n';
  return OS;
}

LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS,
                        const AlignVectors::ByteSpan::Block &B) {
  OS << "  @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
     << *B.Seg.Val;
  return OS;
}

LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
  OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
  for (const AlignVectors::ByteSpan::Block &B : BS)
    OS << B << '\n';
  OS << ']';
  return OS;
}

class HvxIdioms {
public:
  HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
    auto *Int32Ty = HVC.getIntTy(32);
    HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
    HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
  }

  bool run();

private:
  enum Signedness { Positive, Signed, Unsigned };

  // Value + sign
  // This is to keep track of whether the value should be treated as signed
  // or unsigned, or is known to be positive.
  struct SValue {
    Value *Val;
    Signedness Sgn;
  };

  struct FxpOp {
    unsigned Opcode;
    unsigned Frac; // Number of fraction bits
    SValue X, Y;
    // If present, add 1 << RoundAt before shift:
    std::optional<unsigned> RoundAt;
  };

  auto getNumSignificantBits(Value *V, Instruction *In) const
      -> std::pair<unsigned, Signedness>;
  auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;

  auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
  auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;

  auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
                            const FxpOp &Op) const -> Value *;
  auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
                    bool Rounding) const -> Value *;
  auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
                    bool Rounding) const -> Value *;
  // Return {Result, Carry}, where Carry is a vector predicate.
  auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
                      Value *CarryIn = nullptr) const
      -> std::pair<Value *, Value *>;
  auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
  auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
      -> Value *;
  auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
      -> std::pair<Value *, Value *>;
  auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
                     ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
  auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
                     Signedness SgnX, ArrayRef<Value *> WordY,
                     Signedness SgnY) const -> SmallVector<Value *>;

  VectorType *HvxI32Ty;
  VectorType *HvxP32Ty;
  const HexagonVectorCombine &HVC;

  friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
};

[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
                                         const HvxIdioms::FxpOp &Op) {
  static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
  OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
  if (Op.RoundAt.has_value()) {
    if (Op.Frac != 0 && Op.RoundAt.value() == Op.Frac - 1) {
      OS << ":rnd";
    } else {
      OS << " + 1<<" << Op.RoundAt.value();
    }
  }
  OS << "\n  X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
     << "  Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
  return OS;
}

} // namespace

namespace {

template <typename T> T *getIfUnordered(T *MaybeT) {
  return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
}
template <typename T> T *isCandidate(Instruction *In) {
  return dyn_cast<T>(In);
}
template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
  return getIfUnordered(dyn_cast<LoadInst>(In));
}
template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
  return getIfUnordered(dyn_cast<StoreInst>(In));
}

#if !defined(_MSC_VER) || _MSC_VER >= 1926
// VS2017 and some versions of VS2019 have trouble compiling this:
// error C2976: 'std::map': too few template arguments
// VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
template <typename Pred, typename... Ts>
void erase_if(std::map<Ts...> &map, Pred p)
#else
template <typename Pred, typename T, typename U>
void erase_if(std::map<T, U> &map, Pred p)
#endif
{
  for (auto i = map.begin(), e = map.end(); i != e;) {
    if (p(*i))
      i = map.erase(i);
    else
      i = std::next(i);
  }
}

// Forward other erase_ifs to the LLVM implementations.
template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
  llvm::erase_if(std::forward<T>(container), p);
}

} // namespace

// --- Begin AlignVectors

auto AlignVectors::ByteSpan::extent() const -> int {
  if (size() == 0)
    return 0;
  int Min = Blocks[0].Pos;
  int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
  for (int i = 1, e = size(); i != e; ++i) {
    Min = std::min(Min, Blocks[i].Pos);
    Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
  }
  return Max - Min;
}

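// A worked example of section() below: a block covering bytes [96,160) of
// the span (Pos = 96, Seg.Start = 0, Seg.Size = 64), intersected with
// section(128, 64), contributes Segment(Val, 32, 32) at Pos = 128, i.e.
// the upper half of the original segment.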
auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
  ByteSpan Section;
  for (const ByteSpan::Block &B : Blocks) {
    int L = std::max(B.Pos, Start);                       // Left end.
    int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
    if (L < R) {
      // How much to chop off the beginning of the segment:
      int Off = L > B.Pos ? L - B.Pos : 0;
      Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
    }
  }
  return Section;
}

auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
  for (Block &B : Blocks)
    B.Pos += Offset;
  return *this;
}

auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
  SmallVector<Value *, 8> Values(Blocks.size());
  for (int i = 0, e = Blocks.size(); i != e; ++i)
    Values[i] = Blocks[i].Seg.Val;
  return Values;
}

auto AlignVectors::getAlignFromValue(const Value *V) const -> Align {
  const auto *C = dyn_cast<ConstantInt>(V);
  assert(C && "Alignment must be a compile-time constant integer");
  return C->getAlignValue();
}

auto AlignVectors::getAddrInfo(Instruction &In) const
    -> std::optional<AddrInfo> {
  if (auto *L = isCandidate<LoadInst>(&In))
    return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
                    L->getAlign());
  if (auto *S = isCandidate<StoreInst>(&In))
    return AddrInfo(HVC, S, S->getPointerOperand(),
                    S->getValueOperand()->getType(), S->getAlign());
  if (auto *II = isCandidate<IntrinsicInst>(&In)) {
    Intrinsic::ID ID = II->getIntrinsicID();
    switch (ID) {
    case Intrinsic::masked_load:
      return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
                      getAlignFromValue(II->getArgOperand(1)));
    case Intrinsic::masked_store:
      return AddrInfo(HVC, II, II->getArgOperand(1),
                      II->getArgOperand(0)->getType(),
                      getAlignFromValue(II->getArgOperand(2)));
    }
  }
  return std::nullopt;
}

auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
  return HVC.HST.isTypeForHVX(AI.ValTy);
}

auto AlignVectors::getPayload(Value *Val) const -> Value * {
  if (auto *In = dyn_cast<Instruction>(Val)) {
    Intrinsic::ID ID = 0;
    if (auto *II = dyn_cast<IntrinsicInst>(In))
      ID = II->getIntrinsicID();
    if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
      return In->getOperand(0);
  }
  return Val;
}

auto AlignVectors::getMask(Value *Val) const -> Value * {
  if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::masked_load:
      return II->getArgOperand(2);
    case Intrinsic::masked_store:
      return II->getArgOperand(3);
    }
  }

  Type *ValTy = getPayload(Val)->getType();
  if (auto *VecTy = dyn_cast<VectorType>(ValTy))
    return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy)));
  return HVC.getFullValue(HVC.getBoolTy());
}

auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
  if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
    if (II->getIntrinsicID() == Intrinsic::masked_load)
      return II->getArgOperand(3);
  }
  return UndefValue::get(getPayload(Val)->getType());
}

auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
                                         Type *ValTy, int Adjust) const
    -> Value * {
  // The adjustment is in bytes, but if it's a multiple of the type size,
  // we don't need to do pointer casts.
  auto *PtrTy = cast<PointerType>(Ptr->getType());
  if (!PtrTy->isOpaque()) {
    Type *ElemTy = PtrTy->getNonOpaquePointerElementType();
    int ElemSize = HVC.getSizeOf(ElemTy, HVC.Alloc);
    if (Adjust % ElemSize == 0 && Adjust != 0) {
      Value *Tmp0 =
          Builder.CreateGEP(ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize));
      return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
    }
  }

  PointerType *CharPtrTy = Type::getInt8PtrTy(HVC.F.getContext());
  Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy);
  Value *Tmp1 = Builder.CreateGEP(Type::getInt8Ty(HVC.F.getContext()), Tmp0,
                                  HVC.getConstInt(Adjust));
  return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo());
}

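// In createAlignedPointer below, the AND with -Alignment clears the low
// bits of the address; e.g. with Alignment = 128, a pointer whose integer
// value is 0x12345 is rewritten to point at 0x12300.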
auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
                                        Type *ValTy, int Alignment) const
    -> Value * {
  Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy());
  Value *Mask = HVC.getConstInt(-Alignment);
  Value *And = Builder.CreateAnd(AsInt, Mask);
  return Builder.CreateIntToPtr(And, ValTy->getPointerTo());
}

auto AlignVectors::createAlignedLoad(IRBuilderBase &Builder, Type *ValTy,
                                     Value *Ptr, int Alignment, Value *Mask,
                                     Value *PassThru) const -> Value * {
  assert(!HVC.isUndef(Mask)); // Should this be allowed?
  if (HVC.isZero(Mask))
    return PassThru;
  if (Mask == ConstantInt::getTrue(Mask->getType()))
    return Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment));
  return Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment), Mask, PassThru);
}

auto AlignVectors::createAlignedStore(IRBuilderBase &Builder, Value *Val,
                                      Value *Ptr, int Alignment,
                                      Value *Mask) const -> Value * {
  if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
    return UndefValue::get(Val->getType());
  if (Mask == ConstantInt::getTrue(Mask->getType()))
    return Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
  return Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
}

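// For example, if In is a load whose address comes from a GEP located
// after Base in the block, the GEP (and, transitively, its in-block
// operands that also follow Base) ends up in the returned DepList, since
// those instructions would have to move together with In.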
auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
    -> DepList {
  BasicBlock *Parent = Base->getParent();
  assert(In->getParent() == Parent &&
         "Base and In should be in the same block");
  assert(Base->comesBefore(In) && "Base should come before In");

  DepList Deps;
  std::deque<Instruction *> WorkQ = {In};
  while (!WorkQ.empty()) {
    Instruction *D = WorkQ.front();
    WorkQ.pop_front();
    Deps.insert(D);
    for (Value *Op : D->operands()) {
      if (auto *I = dyn_cast<Instruction>(Op)) {
        if (I->getParent() == Parent && Base->comesBefore(I))
          WorkQ.push_back(I);
      }
    }
  }
  return Deps;
}

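// For example, loads from p, p+64, and p+128 have constant pointer
// differences from one another, so they form a single address group keyed
// by the first of them that was visited, with Offset = 0, 64, and 128
// respectively.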
auto AlignVectors::createAddressGroups() -> bool {
  // An address group created here may contain instructions spanning
  // multiple basic blocks.
  AddrList WorkStack;

  auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
    for (AddrInfo &W : WorkStack) {
      if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
        return std::make_pair(W.Inst, *D);
    }
    return std::make_pair(nullptr, 0);
  };

  auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
    BasicBlock &Block = *DomN->getBlock();
    for (Instruction &I : Block) {
      auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
      if (!AI)
        continue;
      auto F = findBaseAndOffset(*AI);
      Instruction *GroupInst;
      if (Instruction *BI = F.first) {
        AI->Offset = F.second;
        GroupInst = BI;
      } else {
        WorkStack.push_back(*AI);
        GroupInst = AI->Inst;
      }
      AddrGroups[GroupInst].push_back(*AI);
    }

    for (DomTreeNode *C : DomN->children())
      Visit(C, Visit);

    while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
      WorkStack.pop_back();
  };

  traverseBlock(HVC.DT.getRootNode(), traverseBlock);
  assert(WorkStack.empty());

  // AddrGroups are formed.

  // Remove groups of size 1.
  erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
  // Remove groups that don't use HVX types.
  erase_if(AddrGroups, [&](auto &G) {
    return llvm::none_of(
        G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
  });

  return !AddrGroups.empty();
}

auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
  // Form load groups.
  // To avoid complications with moving code across basic blocks, only form
  // groups that are contained within a single basic block.

  auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
    assert(!Move.Main.empty() && "Move group should have non-empty Main");
    // Don't mix HVX and non-HVX instructions.
    if (Move.IsHvx != isHvx(Info))
      return false;
    // Leading instruction in the load group.
    Instruction *Base = Move.Main.front();
    if (Base->getParent() != Info.Inst->getParent())
      return false;

    auto isSafeToMoveToBase = [&](const Instruction *I) {
      return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator());
    };
    DepList Deps = getUpwardDeps(Info.Inst, Base);
    if (!llvm::all_of(Deps, isSafeToMoveToBase))
      return false;

    // The dependencies will be moved together with the load, so make sure
    // that none of them could be moved independently in another group.
    Deps.erase(Info.Inst);
    auto inAddrMap = [&](Instruction *I) { return AddrGroups.count(I) > 0; };
    if (llvm::any_of(Deps, inAddrMap))
      return false;
    Move.Main.push_back(Info.Inst);
    llvm::append_range(Move.Deps, Deps);
    return true;
  };

  MoveList LoadGroups;

  for (const AddrInfo &Info : Group) {
    if (!Info.Inst->mayReadFromMemory())
      continue;
    if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
      LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
  }

  // Erase singleton groups.
  erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
  return LoadGroups;
}

auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
  // Form store groups.
  // To avoid complications with moving code across basic blocks, only form
  // groups that are contained within a single basic block.

  auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
    assert(!Move.Main.empty() && "Move group should have non-empty Main");
    // For stores with return values we'd have to collect downward
    // dependencies. There are no such stores that we handle at the moment,
    // so omit that.
    assert(Info.Inst->getType()->isVoidTy() &&
           "Not handling stores with return values");
    // Don't mix HVX and non-HVX instructions.
    if (Move.IsHvx != isHvx(Info))
      return false;
    // For stores we need to be careful whether it's safe to move them.
    // Stores that are otherwise safe to move together may not appear safe
    // to move over one another (i.e. isSafeToMoveBefore may return false).
    Instruction *Base = Move.Main.front();
    if (Base->getParent() != Info.Inst->getParent())
      return false;
    if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
      return false;
    Move.Main.push_back(Info.Inst);
    return true;
  };

  MoveList StoreGroups;

  for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
    const AddrInfo &Info = *I;
    if (!Info.Inst->mayWriteToMemory())
      continue;
    if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
      StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
  }

  // Erase singleton groups.
  erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
  return StoreGroups;
}

auto AlignVectors::move(const MoveGroup &Move) const -> bool {
  assert(!Move.Main.empty() && "Move group should have non-empty Main");
  Instruction *Where = Move.Main.front();

  if (Move.IsLoad) {
    // Move all deps to before Where, keeping order.
    for (Instruction *D : Move.Deps)
      D->moveBefore(Where);
    // Move all main instructions to after Where, keeping order.
    ArrayRef<Instruction *> Main(Move.Main);
    for (Instruction *M : Main.drop_front(1)) {
      M->moveAfter(Where);
      Where = M;
    }
  } else {
    // NOTE: Deps are empty for "store" groups. If they need to be
    // non-empty, decide on the order.
    assert(Move.Deps.empty());
    // Move all main instructions to before Where, inverting order.
    ArrayRef<Instruction *> Main(Move.Main);
    for (Instruction *M : Main.drop_front(1)) {
      M->moveBefore(Where);
      Where = M;
    }
  }

  return Move.Main.size() + Move.Deps.size() > 1;
}

auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
                                    const ByteSpan &VSpan, int ScLen,
                                    Value *AlignVal, Value *AlignAddr) const
    -> void {
  Type *SecTy = HVC.getByteTy(ScLen);
  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
  bool DoAlign = !HVC.isZero(AlignVal);
  BasicBlock::iterator BasePos = Builder.GetInsertPoint();
  BasicBlock *BaseBlock = Builder.GetInsertBlock();

  ByteSpan ASpan;
  auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
  auto *Undef = UndefValue::get(SecTy);

  SmallVector<Instruction *> Loads(NumSectors + DoAlign, nullptr);

  // We could create all of the aligned loads, and generate the valigns
  // at the location of the first load, but for large load groups, this
  // could create highly suboptimal code (there have been groups of 140+
  // loads in real code).
  // Instead, place the loads/valigns as close to the users as possible.
  // In any case we need to have a mapping from the blocks of VSpan (the
  // span covered by the pre-existing loads) to ASpan (the span covered
  // by the aligned loads). There is a small problem, though: ASpan needs
  // to have pointers to the loads/valigns, but we don't know where to put
  // them yet. We can't use nullptr, because when we create sections of
  // ASpan (corresponding to blocks from VSpan), for each block in the
  // section we need to know which blocks of ASpan they are a part of.
  // To have a 1-1 mapping between blocks of ASpan and the temporary value
  // pointers, use the addresses of the blocks themselves.

  // Populate the blocks first, to avoid reallocations of the vector
  // interfering with generating the placeholder addresses.
  for (int Index = 0; Index != NumSectors; ++Index)
    ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
  for (int Index = 0; Index != NumSectors; ++Index) {
    ASpan.Blocks[Index].Seg.Val =
        reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
  }

  // Multiple values from VSpan can map to the same value in ASpan. Since we
  // try to create loads lazily, we need to find the earliest use for each
  // value from ASpan.
  DenseMap<void *, Instruction *> EarliestUser;
  auto isEarlier = [](Instruction *A, Instruction *B) {
    if (B == nullptr)
      return true;
    if (A == nullptr)
      return false;
    assert(A->getParent() == B->getParent());
    return A->comesBefore(B);
  };
  auto earliestUser = [&](const auto &Uses) {
    Instruction *User = nullptr;
    for (const Use &U : Uses) {
      auto *I = dyn_cast<Instruction>(U.getUser());
      assert(I != nullptr && "Load used in a non-instruction?");
      // Make sure we only consider users in this block, but we need
      // to remember if there were users outside the block too. This is
      // because if there are no users, aligned loads will not be created.
      if (I->getParent() == BaseBlock) {
        if (!isa<PHINode>(I))
          User = std::min(User, I, isEarlier);
      } else {
        User = std::min(User, BaseBlock->getTerminator(), isEarlier);
      }
    }
    return User;
  };

  for (const ByteSpan::Block &B : VSpan) {
    ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
    for (const ByteSpan::Block &S : ASection) {
      EarliestUser[S.Seg.Val] = std::min(
          EarliestUser[S.Seg.Val], earliestUser(B.Seg.Val->uses()), isEarlier);
    }
  }

  auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
                        int Index) {
    Value *Ptr =
        createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
    // FIXME: generate a predicated load?
    Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
    // If vector shifting is potentially needed, accumulate metadata
    // from source sections of twice the load width.
    int Start = (Index - DoAlign) * ScLen;
    int Width = (1 + DoAlign) * ScLen;
    propagateMetadata(cast<Instruction>(Load),
                      VSpan.section(Start, Width).values());
    return cast<Instruction>(Load);
  };

  auto moveBefore = [this](Instruction *In, Instruction *To) {
    // Move In and its upward dependencies to before To.
    assert(In->getParent() == To->getParent());
    DepList Deps = getUpwardDeps(In, To);
    // DepList is sorted with respect to positions in the basic block.
    for (Instruction *I : Deps)
      I->moveBefore(To);
  };

  // Generate necessary loads at appropriate locations.
  for (int Index = 0; Index != NumSectors + 1; ++Index) {
    // In ASpan, each block will be either a single aligned load, or a
    // valign of a pair of loads. In the latter case, aligned load j will
    // belong both to the valign in block j and to the valign in the
    // previous block (for j > 0).
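    // Concretely, when DoAlign is set, block j becomes
    //   vralign(Loads[j], Loads[j+1], AlignVal)
    // so load j is shared between the valigns of blocks j-1 and j.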
    Instruction *PrevAt =
        DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
    Instruction *ThisAt =
        Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
    if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
      Builder.SetInsertPoint(Where);
      Loads[Index] = createLoad(Builder, VSpan, Index);
      // We know it's safe to put the load at BasePos, so if it's not safe
      // to move it from this location to BasePos, then the current location
      // is not valid.
      // We can't do this check proactively because we need the load to exist
      // in order to check legality.
      if (!HVC.isSafeToMoveBeforeInBB(*Loads[Index], BasePos))
        moveBefore(Loads[Index], &*BasePos);
    }
  }
  // Generate valigns if needed, and fill in proper values in ASpan.
  for (int Index = 0; Index != NumSectors; ++Index) {
    ASpan[Index].Seg.Val = nullptr;
    if (auto *Where = EarliestUser[&ASpan[Index]]) {
      Builder.SetInsertPoint(Where);
      Value *Val = Loads[Index];
      assert(Val != nullptr);
      if (DoAlign) {
        Value *NextLoad = Loads[Index + 1];
        assert(NextLoad != nullptr);
        Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
      }
      ASpan[Index].Seg.Val = Val;
    }
  }

  for (const ByteSpan::Block &B : VSpan) {
    ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
    Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
    Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));

    for (ByteSpan::Block &S : ASection) {
      if (S.Seg.Val == nullptr)
        continue;
      // The processing of the data loaded by the aligned loads
      // needs to be inserted after the data is available.
      Instruction *SegI = cast<Instruction>(S.Seg.Val);
      Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
      Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
      Accum = HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
    }
    // Instead of casting everything to bytes for the vselect, cast to the
    // original value type. This will avoid complications with casting masks.
    // For example, in cases when the original mask applied to i32, it could
    // be converted to a mask applicable to i8 via pred_typecast intrinsic,
    // but if the mask is not exactly of HVX length, extra handling would be
    // needed to make it work.
    Type *ValTy = getPayload(B.Seg.Val)->getType();
    Value *Cast = Builder.CreateBitCast(Accum, ValTy);
    Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
                                      getPassThrough(B.Seg.Val));
    B.Seg.Val->replaceAllUsesWith(Sel);
  }
}

auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
                                     const ByteSpan &VSpan, int ScLen,
                                     Value *AlignVal, Value *AlignAddr) const
    -> void {
  Type *SecTy = HVC.getByteTy(ScLen);
  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
  bool DoAlign = !HVC.isZero(AlignVal);

  // Stores.
  ByteSpan ASpanV, ASpanM;

  // Return a vector value corresponding to the input value Val:
  // either <1 x Val> for scalar Val, or Val itself for vector Val.
  auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
    Type *Ty = Val->getType();
    if (Ty->isVectorTy())
      return Val;
    auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
    return Builder.CreateBitCast(Val, VecTy);
  };

  // Create an extra "undef" sector at the beginning and at the end.
  // They will be used as the left/right filler in the vlalign step.
  for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
    // For stores, the size of each section is an aligned vector length.
    // Adjust the store offsets relative to the section start offset.
    ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
    Value *AccumV = UndefValue::get(SecTy);
    Value *AccumM = HVC.getNullValue(SecTy);
    for (ByteSpan::Block &S : VSection) {
      Value *Pay = getPayload(S.Seg.Val);
      Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
                                Pay->getType(), HVC.getByteTy());
      AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
                           S.Seg.Start, S.Seg.Size, S.Pos);
      AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
                           S.Seg.Start, S.Seg.Size, S.Pos);
    }
    ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
    ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
  }

  // vlalign
  if (DoAlign) {
    for (int j = 1; j != NumSectors + 2; ++j) {
      Value *PrevV = ASpanV[j - 1].Seg.Val, *ThisV = ASpanV[j].Seg.Val;
      Value *PrevM = ASpanM[j - 1].Seg.Val, *ThisM = ASpanM[j].Seg.Val;
      assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
      ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
      ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
    }
  }

  for (int i = 0; i != NumSectors + DoAlign; ++i) {
    Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
    Value *Val = ASpanV[i].Seg.Val;
    Value *Mask = ASpanM[i].Seg.Val; // bytes
    if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
      Value *Store =
          createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
      // If vector shifting is potentially needed, accumulate metadata
      // from source sections of twice the store width.
      int Start = (i - DoAlign) * ScLen;
      int Width = (1 + DoAlign) * ScLen;
      propagateMetadata(cast<Instruction>(Store),
                        VSpan.section(Start, Width).values());
    }
  }
}

auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
  // TODO: Needs support for masked loads/stores of "scalar" vectors.
  if (!Move.IsHvx)
    return false;

  // Return the element with the maximum alignment from Range,
  // where GetValue obtains the value to compare from an element.
  auto getMaxOf = [](auto Range, auto GetValue) {
    return *std::max_element(
        Range.begin(), Range.end(),
        [&GetValue](auto &A, auto &B) { return GetValue(A) < GetValue(B); });
  };

  const AddrList &BaseInfos = AddrGroups.at(Move.Base);

  // Conceptually, there is a vector of N bytes covering the addresses
  // starting from the minimum offset (i.e. Base.Addr+Start). This vector
  // represents a contiguous memory region that spans all accessed memory
  // locations.
  // The correspondence between loaded or stored values will be expressed
  // in terms of this vector. For example, the 0th element of the vector
  // from the Base address info will start at byte Start from the beginning
  // of this conceptual vector.
  //
  // This vector will be loaded/stored starting at the nearest down-aligned
  // address, and the amount of the down-alignment will be AlignVal:
  //   valign(load_vector(align_down(Base+Start)), AlignVal)

  std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
  AddrList MoveInfos;
  llvm::copy_if(
      BaseInfos, std::back_inserter(MoveInfos),
      [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });

  // Maximum alignment present in the whole address group.
  const AddrInfo &WithMaxAlign =
      getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
  Align MaxGiven = WithMaxAlign.HaveAlign;

  // The member of the move address group with the lowest offset (i.e. the
  // lowest address).
  const AddrInfo &WithMinOffset =
      getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });

  const AddrInfo &WithMaxNeeded =
      getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
  Align MinNeeded = WithMaxNeeded.NeedAlign;

  // Set the builder's insertion point right before the load group, or
  // immediately after the store group. (Instructions in a store group are
  // listed in reverse order.)
  Instruction *InsertAt = Move.Main.front();
  if (!Move.IsLoad) {
    // There should be a terminator (which a store isn't, but check anyway).
    assert(InsertAt->getIterator() != InsertAt->getParent()->end());
    InsertAt = &*std::next(InsertAt->getIterator());
  }

  IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
                    InstSimplifyFolder(HVC.DL));
  Value *AlignAddr = nullptr; // Actual aligned address.
  Value *AlignVal = nullptr;  // Right-shift amount (for valign).

  if (MinNeeded <= MaxGiven) {
    int Start = WithMinOffset.Offset;
    int OffAtMax = WithMaxAlign.Offset;
    // Shift the offset of the maximally aligned instruction (OffAtMax)
    // back by just enough multiples of the required alignment to cover the
    // distance from Start to OffAtMax.
    // Calculate the address adjustment amount based on the address with the
    // maximum alignment. This is to allow a simple gep instruction instead
    // of potential bitcasts to i8*.
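    // For example, with Start = 0, OffAtMax = 12, and MinNeeded = 8:
    // Adjust = -alignTo(12, 8) = -16, so AlignAddr is 16 bytes below the
    // maximally aligned address, and Diff = 0 - (12 - 16) = 4 becomes the
    // valign shift amount (0 <= Diff < MinNeeded).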
    int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
    AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
                                      WithMaxAlign.ValTy, Adjust);
    int Diff = Start - (OffAtMax + Adjust);
    AlignVal = HVC.getConstInt(Diff);
    assert(Diff >= 0);
    assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
  } else {
    // WithMinOffset is the lowest address in the group,
    //   WithMinOffset.Addr = Base+Start.
    // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
    // mask off unnecessary bits, so it's ok to just use the original pointer
    // as the alignment amount.
    // Do an explicit down-alignment of the address to avoid creating an
    // aligned instruction with an address that is not really aligned.
    AlignAddr = createAlignedPointer(Builder, WithMinOffset.Addr,
                                     WithMinOffset.ValTy, MinNeeded.value());
    AlignVal = Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy());
  }

  ByteSpan VSpan;
  for (const AddrInfo &AI : MoveInfos) {
    VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
                              AI.Offset - WithMinOffset.Offset);
  }

  // The aligned loads/stores will use blocks that are either scalars,
  // or HVX vectors. Let "sector" be the unified term for such a block.
  // blend(scalar, vector) -> sector...
  int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
                         : std::max<int>(MinNeeded.value(), 4);
  assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
  assert(Move.IsHvx || ScLen == 4 || ScLen == 8);

  if (Move.IsLoad)
    realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
  else
    realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);

  for (auto *Inst : Move.Main)
    Inst->eraseFromParent();

  return true;
}

auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
  if (!HVC.isByteVecTy(Ty))
    return false;
  int Size = HVC.getSizeOf(Ty);
  if (HVC.HST.isTypeForHVX(Ty))
    return Size == static_cast<int>(HVC.HST.getVectorLength());
  return Size == 4 || Size == 8;
}

auto AlignVectors::run() -> bool {
  if (!createAddressGroups())
    return false;

  bool Changed = false;
  MoveList LoadGroups, StoreGroups;

  for (auto &G : AddrGroups) {
    llvm::append_range(LoadGroups, createLoadGroups(G.second));
    llvm::append_range(StoreGroups, createStoreGroups(G.second));
  }

  for (auto &M : LoadGroups)
    Changed |= move(M);
  for (auto &M : StoreGroups)
    Changed |= move(M);

  for (auto &M : LoadGroups)
    Changed |= realignGroup(M);
  for (auto &M : StoreGroups)
    Changed |= realignGroup(M);

  return Changed;
}

// --- End AlignVectors

// --- Begin HvxIdioms

auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
    -> std::pair<unsigned, Signedness> {
  unsigned Bits = HVC.getNumSignificantBits(V, In);
  // The significant bits are calculated including the sign bit. This may
  // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
  // result in 33 significant bits. To avoid extra words, skip the extra
  // sign bit, but keep information that the value is to be treated as
  // unsigned.
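  // Continuing the zext example: Bits = 33, Bits - 1 = 32 is a power of 2,
  // and the top 32 bits are known zero, so the result is {32, Unsigned}.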
  KnownBits Known = HVC.getKnownBits(V, In);
  Signedness Sign = Signed;
  unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
  if (isPowerOf2_32(Bits))
    NumToTest = Bits;
  else if (Bits > 1 && isPowerOf2_32(Bits - 1))
    NumToTest = Bits - 1;

  if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
    Sign = Unsigned;
    Bits = NumToTest;
  }

  // If the top bit of the nearest power-of-2 is zero, this value is
  // positive. It could be treated as either signed or unsigned.
  if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
    if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
      Sign = Positive;
  }
  return {Bits, Sign};
}

auto HvxIdioms::canonSgn(SValue X, SValue Y) const
    -> std::pair<SValue, SValue> {
  // Canonicalize the signedness of X and Y, so that the result is one of:
  //   S, S
  //   U/P, S
  //   U/P, U/P
  if (X.Sgn == Signed && Y.Sgn != Signed)
    std::swap(X, Y);
  return {X, Y};
}

// Match
//   (X * Y) [>> N], or
//   ((X * Y) + (1 << N-1)) >> N
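// As an illustration, a rounding Q15 multiply would typically appear as
// (vector constants shown as splats for brevity):
//   %m = mul <64 x i32> %x, %y
//   %a = add <64 x i32> %m, splat(16384)
//   %s = ashr <64 x i32> %a, splat(15)
// which is matched with Opcode = Mul, Frac = 15, RoundAt = 14.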
auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
  using namespace PatternMatch;
  auto *Ty = In.getType();

  if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
    return std::nullopt;

  unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();

  FxpOp Op;
  Value *Exp = &In;

  // Fixed-point multiplication is always shifted right (except when the
  // fraction is 0 bits).
  auto m_Shr = [](auto &&V, auto &&S) {
    return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
  };

  const APInt *Qn = nullptr;
  if (Value *T; match(Exp, m_Shr(m_Value(T), m_APInt(Qn)))) {
    Op.Frac = Qn->getZExtValue();
    Exp = T;
  } else {
    Op.Frac = 0;
  }

  if (Op.Frac > Width)
    return std::nullopt;

  // Check if there is rounding added.
  const APInt *C = nullptr;
  if (Value *T; Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_APInt(C)))) {
    unsigned CV = C->getZExtValue();
    if (CV != 0 && !isPowerOf2_32(CV))
      return std::nullopt;
    if (CV != 0)
      Op.RoundAt = Log2_32(CV);
    Exp = T;
  }

  // Check if the rest is a multiplication.
  if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
    Op.Opcode = Instruction::Mul;
    // FIXME: The information below is recomputed.
    Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
    Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
    return Op;
  }

  return std::nullopt;
}

auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
    -> Value * {
  assert(Op.X.Val->getType() == Op.Y.Val->getType());

  auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
  if (VecTy == nullptr)
    return nullptr;
  auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
  unsigned ElemWidth = ElemTy->getBitWidth();

  // TODO: This can be relaxed after legalization is done pre-isel.
  if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
    return nullptr;

  // There are no special intrinsics that should be used for multiplying
  // signed 8-bit values, so just skip them. Normal codegen should handle
  // this just fine.
  if (ElemWidth <= 8)
    return nullptr;
  // Similarly, if this is just a multiplication that can be handled without
  // intervention, then leave it alone.
  if (ElemWidth <= 32 && Op.Frac == 0)
    return nullptr;

  auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
  auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);

  // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).

  Value *X = Op.X.Val, *Y = Op.Y.Val;
  IRBuilder Builder(In.getParent(), In.getIterator(),
                    InstSimplifyFolder(HVC.DL));

  auto roundUpWidth = [](unsigned Width) -> unsigned {
    if (Width <= 32 && !isPowerOf2_32(Width)) {
      // If the element width is not a power of 2, round it up
      // to the next one. Do this for widths not exceeding 32.
      return PowerOf2Ceil(Width);
    }
    if (Width > 32 && Width % 32 != 0) {
      // For wider elements, round it up to the multiple of 32.
      return alignTo(Width, 32u);
    }
    return Width;
  };
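  // roundUpWidth examples: 10 -> 16, 20 -> 32, 40 -> 64; 64 stays 64.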
1348 
1349   BitsX = roundUpWidth(BitsX);
1350   BitsY = roundUpWidth(BitsY);
1351 
1352   // For elementwise multiplication vectors must have the same lengths, so
1353   // resize the elements of both inputs to the same width, the max of the
1354   // calculated significant bits.
1355   unsigned Width = std::max(BitsX, BitsY);
1356 
1357   auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1358   if (Width < ElemWidth) {
1359     X = Builder.CreateTrunc(X, ResizeTy);
1360     Y = Builder.CreateTrunc(Y, ResizeTy);
1361   } else if (Width > ElemWidth) {
1362     X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy)
1363                         : Builder.CreateZExt(X, ResizeTy);
1364     Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy)
1365                         : Builder.CreateZExt(Y, ResizeTy);
1366   }
1367 
1368   assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1369 
1370   unsigned VecLen = HVC.length(ResizeTy);
1371   unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
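     // Each chunk spans one full HVX vector for Width <= 32; for wider
     // elements (processed as 32-bit words) it spans Width/32 vectors.
     // E.g. with 128-byte vectors and Width == 32, ChopLen == 1024 / 32
     // == 32 elements.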
1372 
1373   SmallVector<Value *> Results;
1374   FxpOp ChopOp = Op;
1375 
1376   for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1377     ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1378     ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1379     Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1380     if (Results.back() == nullptr)
1381       break;
1382   }
1383 
1384   if (Results.empty() || Results.back() == nullptr)
1385     return nullptr;
1386 
1387   Value *Cat = HVC.concat(Builder, Results);
1388   Value *Ext = SignX == Signed || SignY == Signed
1389                    ? Builder.CreateSExt(Cat, VecTy)
1390                    : Builder.CreateZExt(Cat, VecTy);
1391   return Ext;
1392 }
1393 
1394 auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
1395                                      const FxpOp &Op) const -> Value * {
1396   assert(Op.X.Val->getType() == Op.Y.Val->getType());
1397   auto *InpTy = cast<VectorType>(Op.X.Val->getType());
1398   unsigned Width = InpTy->getScalarSizeInBits();
1399   bool Rounding = Op.RoundAt.has_value();
1400 
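       // Q15/Q31 case: e.g. for Q15, Width == 16 and Op.Frac == 15, with the
       // optional rounding term being 1 << 14 (i.e. RoundAt == Frac - 1).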
1401   if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
1402     // The fixed-point intrinsics do signed multiplication.
1403     if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
1404       Value *QMul = nullptr;
1405       if (Width == 16) {
1406         QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
1407       } else if (Width == 32) {
1408         QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
1409       }
1410       if (QMul != nullptr)
1411         return QMul;
1412     }
1413   }
1414 
1415   assert(Width >= 32 || isPowerOf2_32(Width)); // Width < 32 => Width is 2^n
1416   assert(Width < 32 || Width % 32 == 0);       // Width >= 32 => Width is 32*k
1417 
1418   // If Width < 32, then it should really be 16.
1419   if (Width < 32) {
1420     if (Width < 16)
1421       return nullptr;
1422     // Getting here with Op.Frac == 0 isn't wrong, just suboptimal: here we
1423     // generate a full-precision product, which is unnecessary if there is
1424     // no shift.
1425     assert(Width == 16);
1426     assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
1427     if (Op.Frac == 16) {
1428       // Multiply high
1429       if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
1430         return MulH;
1431     }
1432     // Do full-precision multiply and shift.
1433     Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
1434     if (Rounding) {
1435       Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);
1436       Prod32 = Builder.CreateAdd(Prod32, RoundVal);
1437     }
1438 
1439     Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);
1440     Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
1441                          ? Builder.CreateAShr(Prod32, ShiftAmt)
1442                          : Builder.CreateLShr(Prod32, ShiftAmt);
1443     return Builder.CreateTrunc(Shifted, InpTy);
1444   }
1445 
1446   // Width >= 32
1447 
1448   // Break up the arguments Op.X and Op.Y into vectors of smaller widths
1449   // in preparation for doing the multiplication in 32-bit parts.
1450   auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
1451   auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
1452   auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
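       // E.g. for 64-bit elements, WordX and WordY each contain 2 words per
       // element, and the full product WordP contains 4 (the product of two
       // N-word numbers occupies 2*N words).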
1453 
1454   auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
1455 
1456   // Add the optional rounding to the proper word.
1457   if (Op.RoundAt.has_value()) {
1458     Value *Zero = HVC.getNullValue(WordX[0]->getType());
1459     SmallVector<Value *> RoundV(WordP.size(), Zero);
1460     RoundV[*Op.RoundAt / 32] =
1461         HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32));
1462     WordP = createAddLong(Builder, WordP, RoundV);
1463   }
1464 
1465   // TODO: Consider factoring this into a createRightShiftLong helper.
1466 
1467   // Shift all products right by Op.Frac.
1468   unsigned SkipWords = Op.Frac / 32;
1469   Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32);
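       // Whole words are dropped via SkipWords; the remaining sub-word shift
       // combines adjacent words with a funnel shift. E.g. for Op.Frac == 40:
       // SkipWords == 1, ShiftAmt == 8, and each result word becomes the low
       // 32 bits of (Hi:Lo) >> 8.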
1470 
1471   for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
1472     int Src = Dst + SkipWords;
1473     Value *Lo = WordP[Src];
1474     if (Src + 1 < End) {
1475       Value *Hi = WordP[Src + 1];
1476       WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
1477                                            {Hi, Lo, ShiftAmt});
1478     } else {
1479       // The shift of the most significant word.
1480       WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt);
1481     }
1482   }
1483   if (SkipWords != 0)
1484     WordP.resize(WordP.size() - SkipWords);
1485 
1486   return HVC.joinVectorElements(Builder, WordP, InpTy);
1487 }
1488 
1489 auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
1490                              bool Rounding) const -> Value * {
1491   assert(X.Val->getType() == Y.Val->getType());
1492   assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
1493   assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
1494 
1495   // There is no non-rounding intrinsic for i16.
1496   if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
1497     return nullptr;
1498 
1499   auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
1500   return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
1501                                 {X.Val, Y.Val});
1502 }
1503 
1504 auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
1505                              bool Rounding) const -> Value * {
1506   Type *InpTy = X.Val->getType();
1507   assert(InpTy == Y.Val->getType());
1508   assert(InpTy->getScalarType() == HVC.getIntTy(32));
1509   assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
1510 
1511   if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
1512     return nullptr;
1513 
1514   auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
1515   auto V6_vmpyo_acc = Rounding
1516                           ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
1517                           : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
1518   Value *V1 =
1519       HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
1520   return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
1521                                 {V1, X.Val, Y.Val});
1522 }
1523 
1524 auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
1525                                Value *CarryIn) const
1526     -> std::pair<Value *, Value *> {
1527   assert(X->getType() == Y->getType());
1528   auto VecTy = cast<VectorType>(X->getType());
1529   if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
1530     SmallVector<Value *> Args = {X, Y};
1531     Intrinsic::ID AddCarry;
1532     if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
1533       AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
1534     } else {
1535       AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
1536       if (CarryIn == nullptr)
1537         CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
1538       Args.push_back(CarryIn);
1539     }
1540     Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
1541                                         /*RetTy=*/nullptr, Args);
1542     Value *Result = Builder.CreateExtractValue(Ret, {0});
1543     Value *CarryOut = Builder.CreateExtractValue(Ret, {1});
1544     return {Result, CarryOut};
1545   }
1546 
1547   // In other cases, do a regular add and detect the carry-out with an
1548   // unsigned compare-less-than. The carry-out can originate in two places:
1549   // adding the carry-in or adding the two input values.
1550   Value *Result1 = X; // Result1 = X + CarryIn
1551   if (CarryIn != nullptr) {
1552     unsigned Width = VecTy->getScalarSizeInBits();
1553     uint32_t Mask = 1;
1554     if (Width < 32) {
1555       for (unsigned i = 0, e = 32 / Width; i != e; ++i)
1556         Mask = (Mask << Width) | 1;
1557     }
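         // Mask has a 1 in the lowest bit of each Width-bit element within
         // a 32-bit lane (e.g. Width == 16 gives 0x00010001), so expanding
         // the predicate with V6_vandqrt yields a +1 in each element whose
         // carry-in bit is set.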
1558     auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
1559     Value *ValueIn =
1560         HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
1561                                {CarryIn, HVC.getConstInt(Mask)});
1562     Result1 = Builder.CreateAdd(X, ValueIn);
1563   }
1564 
1565   Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X);
1566   Value *Result2 = Builder.CreateAdd(Result1, Y);
1567   Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y);
1568   return {Result2, Builder.CreateOr(CarryOut1, CarryOut2)};
1569 }
1570 
1571 auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
1572     -> Value * {
1573   Intrinsic::ID V6_vmpyh = 0;
1574   std::tie(X, Y) = canonSgn(X, Y);
1575 
1576   if (X.Sgn == Signed) {
1577     V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
1578   } else if (Y.Sgn == Signed) {
1579     V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
1580   } else {
1581     V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
1582   }
1583 
1584   // i16*i16 -> i32 / interleaved
1585   Value *P =
1586       HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {X.Val, Y.Val});
1587   // Deinterleave
1588   return HVC.vdeal(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
1589 }
1590 
1591 auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
1592     -> Value * {
1593   Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
1594 
1595   if (HVC.HST.useHVXV69Ops()) {
1596     if (X.Sgn != Signed && Y.Sgn != Signed) {
1597       auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
1598       return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
1599                                     {X.Val, Y.Val});
1600     }
1601   }
1602 
1603   Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
1604   Value *Pair16 = Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty);
1605   unsigned Len = HVC.length(HvxP16Ty) / 2;
1606 
1607   SmallVector<int, 128> PickOdd(Len);
1608   for (int i = 0; i != static_cast<int>(Len); ++i)
1609     PickOdd[i] = 2 * i + 1;
1610 
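       // Each 32-bit product occupies two adjacent 16-bit elements, low half
       // first (little endian); taking the odd-indexed halves yields the high
       // 16 bits of every product, i.e. the mul-high result.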
1611   return Builder.CreateShuffleVector(HVC.sublo(Builder, Pair16),
1612                                      HVC.subhi(Builder, Pair16), PickOdd);
1613 }
1614 
1615 auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
1616     -> std::pair<Value *, Value *> {
1617   assert(X.Val->getType() == Y.Val->getType());
1618   assert(X.Val->getType() == HvxI32Ty);
1619 
1620   Intrinsic::ID V6_vmpy_parts;
1621   std::tie(X, Y) = canonSgn(X, Y);
1622 
1623   if (X.Sgn == Signed) {
1624     V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
1625   } else if (Y.Sgn == Signed) {
1626     V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
1627   } else {
1628     V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
1629   }
1630 
1631   Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
1632                                         {X.Val, Y.Val}, {HvxI32Ty});
1633   Value *Hi = Builder.CreateExtractValue(Parts, {0});
1634   Value *Lo = Builder.CreateExtractValue(Parts, {1});
1635   return {Lo, Hi};
1636 }
1637 
1638 auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
1639                               ArrayRef<Value *> WordY) const
1640     -> SmallVector<Value *> {
1641   assert(WordX.size() == WordY.size());
1642   unsigned Idx = 0, Length = WordX.size();
1643   SmallVector<Value *> Sum(Length);
1644 
1645   while (Idx != Length) {
1646     if (HVC.isZero(WordX[Idx]))
1647       Sum[Idx] = WordY[Idx];
1648     else if (HVC.isZero(WordY[Idx]))
1649       Sum[Idx] = WordX[Idx];
1650     else
1651       break;
1652     ++Idx;
1653   }
1654 
1655   Value *Carry = nullptr;
1656   for (; Idx != Length; ++Idx) {
1657     std::tie(Sum[Idx], Carry) =
1658         createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
1659   }
1660 
1661   // This drops the final carry beyond the highest word.
1662   return Sum;
1663 }
1664 
1665 auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
1666                               Signedness SgnX, ArrayRef<Value *> WordY,
1667                               Signedness SgnY) const -> SmallVector<Value *> {
1668   SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
1669 
1670   // WordX[i] * WordY[j] produces words i+j and i+j+1 of the result,
1671   // that is, halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
1672   for (int i = 0, e = WordX.size(); i != e; ++i) {
1673     for (int j = 0, f = WordY.size(); j != f; ++j) {
1674       // Check the 4 halves that this multiplication can generate.
1675       Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
1676       Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
1677       auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
1678       Products[i + j + 0].push_back(Lo);
1679       Products[i + j + 1].push_back(Hi);
1680     }
1681   }
1682 
1683   Value *Zero = HVC.getNullValue(WordX[0]->getType());
1684 
1685   auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
1686     if (Vector.empty())
1687       return Zero;
1688     auto Last = Vector.back();
1689     Vector.pop_back();
1690     return Last;
1691   };
1692 
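       // Reduce each column of partial products pairwise, as in long addition
       // on paper, propagating the carry from every add into all higher
       // columns, until a single word per column remains.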
1693   for (int i = 0, e = Products.size(); i != e; ++i) {
1694     while (Products[i].size() > 1) {
1695       Value *Carry = nullptr; // no carry-in
1696       for (int j = i; j != e; ++j) {
1697         auto &ProdJ = Products[j];
1698         auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
1699                                               pop_back_or_zero(ProdJ), Carry);
1700         ProdJ.insert(ProdJ.begin(), Sum);
1701         Carry = CarryOut;
1702       }
1703     }
1704   }
1705 
1706   SmallVector<Value *> WordP;
1707   for (auto &P : Products) {
1708     assert(P.size() == 1 && "Should have been added together");
1709     WordP.push_back(P.front());
1710   }
1711 
1712   return WordP;
1713 }
1714 
1715 auto HvxIdioms::run() -> bool {
1716   bool Changed = false;
1717 
1718   for (BasicBlock &B : HVC.F) {
1719     for (auto It = B.rbegin(); It != B.rend(); ++It) {
1720       if (auto Fxm = matchFxpMul(*It)) {
1721         Value *New = processFxpMul(*It, *Fxm);
1722         // Always report "changed" for now.
1723         Changed = true;
1724         if (!New)
1725           continue;
1726         bool StartOver = !isa<Instruction>(New);
1727         It->replaceAllUsesWith(New);
1728         RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
1729         It = StartOver ? B.rbegin()
1730                        : cast<Instruction>(New)->getReverseIterator();
1732       }
1733     }
1734   }
1735 
1736   return Changed;
1737 }
1738 
1739 // --- End HvxIdioms
1740 
1741 auto HexagonVectorCombine::run() -> bool {
1742   if (!HST.useHVXOps())
1743     return false;
1744 
1745   bool Changed = false;
1746   Changed |= AlignVectors(*this).run();
1747   Changed |= HvxIdioms(*this).run();
1748 
1749   return Changed;
1750 }
1751 
1752 auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
1753   return IntegerType::get(F.getContext(), Width);
1754 }
1755 
1756 auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
1757   assert(ElemCount >= 0);
1758   IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
1759   if (ElemCount == 0)
1760     return ByteTy;
1761   return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
1762 }
1763 
1764 auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
1765   assert(ElemCount >= 0);
1766   IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
1767   if (ElemCount == 0)
1768     return BoolTy;
1769   return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
1770 }
1771 
1772 auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
1773     -> ConstantInt * {
1774   return ConstantInt::getSigned(getIntTy(Width), Val);
1775 }
1776 
1777 auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
1778   if (auto *C = dyn_cast<Constant>(Val))
1779     return C->isZeroValue();
1780   return false;
1781 }
1782 
1783 auto HexagonVectorCombine::getIntValue(const Value *Val) const
1784     -> std::optional<APInt> {
1785   if (auto *CI = dyn_cast<ConstantInt>(Val))
1786     return CI->getValue();
1787   return std::nullopt;
1788 }
1789 
1790 auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
1791   return isa<UndefValue>(Val);
1792 }
1793 
1794 auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
1795     -> VectorType * {
1796   EVT ETy = EVT::getEVT(ElemTy, false);
1797   assert(ETy.isSimple() && "Invalid HVX element type");
1798   // Do not allow boolean types here: they don't have a fixed length.
1799   assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
1800          "Invalid HVX element type");
1801   unsigned HwLen = HST.getVectorLength();
1802   unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
1803   return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
1804                          /*Scalable=*/false);
1805 }
1806 
1807 auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
1808     -> int {
1809   return getSizeOf(Val->getType(), Kind);
1810 }
1811 
1812 auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
1813     -> int {
1814   auto *NcTy = const_cast<Type *>(Ty);
1815   switch (Kind) {
1816   case Store:
1817     return DL.getTypeStoreSize(NcTy).getFixedValue();
1818   case Alloc:
1819     return DL.getTypeAllocSize(NcTy).getFixedValue();
1820   }
1821   llvm_unreachable("Unhandled SizeKind enum");
1822 }
1823 
1824 auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
1825   // The actual type may be shorter than the HVX vector, so determine
1826   // the alignment based on subtarget info.
1827   if (HST.isTypeForHVX(Ty))
1828     return HST.getVectorLength();
1829   return DL.getABITypeAlign(Ty).value();
1830 }
1831 
1832 auto HexagonVectorCombine::length(Value *Val) const -> size_t {
1833   return length(Val->getType());
1834 }
1835 
1836 auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
1837   auto *VecTy = dyn_cast<VectorType>(Ty);
1838   assert(VecTy && "Must be a vector type");
1839   return VecTy->getElementCount().getFixedValue();
1840 }
1841 
1842 auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {
1843   assert(Ty->isIntOrIntVectorTy());
1844   auto Zero = ConstantInt::get(Ty->getScalarType(), 0);
1845   if (auto *VecTy = dyn_cast<VectorType>(Ty))
1846     return ConstantVector::getSplat(VecTy->getElementCount(), Zero);
1847   return Zero;
1848 }
1849 
1850 auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {
1851   assert(Ty->isIntOrIntVectorTy());
1852   auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);
1853   if (auto *VecTy = dyn_cast<VectorType>(Ty))
1854     return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);
1855   return Minus1;
1856 }
1857 
1858 auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const
1859     -> Constant * {
1860   assert(Ty->isVectorTy());
1861   auto VecTy = cast<VectorType>(Ty);
1862   Type *ElemTy = VecTy->getElementType();
1863   // Add support for floats if needed.
1864   auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(),
1865                                          ConstantInt::get(ElemTy, Val));
1866   return Splat;
1867 }
1868 
1869 auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
1870   if (auto *In = dyn_cast<Instruction>(V)) {
1871     SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
1872     return simplifyInstruction(In, Q);
1873   }
1874   return nullptr;
1875 }
1876 
1877 // Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
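     // For example, insertb(Builder, Dst, Src, 2, 3, 4) replaces Dst bytes
     // [4..7) with Src bytes [2..5).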
1878 auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
1879                                    Value *Src, int Start, int Length,
1880                                    int Where) const -> Value * {
1881   assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
1882   int SrcLen = getSizeOf(Src);
1883   int DstLen = getSizeOf(Dst);
1884   assert(0 <= Start && Start + Length <= SrcLen);
1885   assert(0 <= Where && Where + Length <= DstLen);
1886 
1887   int P2Len = PowerOf2Ceil(SrcLen | DstLen);
1888   auto *Undef = UndefValue::get(getByteTy());
1889   Value *P2Src = vresize(Builder, Src, P2Len, Undef);
1890   Value *P2Dst = vresize(Builder, Dst, P2Len, Undef);
1891 
1892   SmallVector<int, 256> SMask(P2Len);
1893   for (int i = 0; i != P2Len; ++i) {
1894     // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
1895     // Otherwise, pick Dst[i].
1896     SMask[i] =
1897         (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
1898   }
1899 
1900   Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask);
1901   return vresize(Builder, P2Insert, DstLen, Undef);
1902 }
1903 
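     // Left-align Lo:Hi by Amt bytes: the result consists of the last Amt
     // bytes of Lo followed by the first (VecLen - Amt) bytes of Hi.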
1904 auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
1905                                     Value *Hi, Value *Amt) const -> Value * {
1906   assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
1907   if (isZero(Amt))
1908     return Hi;
1909   int VecLen = getSizeOf(Hi);
1910   if (auto IntAmt = getIntValue(Amt))
1911     return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
1912                            VecLen);
1913 
1914   if (HST.isTypeForHVX(Hi->getType())) {
1915     assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
1916            "Expecting an exact HVX type");
1917     return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
1918                               Hi->getType(), {Hi, Lo, Amt});
1919   }
1920 
1921   if (VecLen == 4) {
1922     Value *Pair = concat(Builder, {Lo, Hi});
1923     Value *Shift = Builder.CreateLShr(Builder.CreateShl(Pair, Amt), 32);
1924     Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
1925     return Builder.CreateBitCast(Trunc, Hi->getType());
1926   }
1927   if (VecLen == 8) {
1928     Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt);
1929     return vralignb(Builder, Lo, Hi, Sub);
1930   }
1931   llvm_unreachable("Unexpected vector length");
1932 }
1933 
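     // Right-align Lo:Hi by Amt bytes: the result consists of the last
     // (VecLen - Amt) bytes of Lo followed by the first Amt bytes of Hi.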
1934 auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
1935                                     Value *Hi, Value *Amt) const -> Value * {
1936   assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
1937   if (isZero(Amt))
1938     return Lo;
1939   int VecLen = getSizeOf(Lo);
1940   if (auto IntAmt = getIntValue(Amt))
1941     return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
1942 
1943   if (HST.isTypeForHVX(Lo->getType())) {
1944     assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
1945            "Expecting an exact HVX type");
1946     return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
1947                               Lo->getType(), {Hi, Lo, Amt});
1948   }
1949 
1950   if (VecLen == 4) {
1951     Value *Pair = concat(Builder, {Lo, Hi});
1952     Value *Shift = Builder.CreateLShr(Pair, Amt);
1953     Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
1954     return Builder.CreateBitCast(Trunc, Lo->getType());
1955   }
1956   if (VecLen == 8) {
1957     Type *Int64Ty = Type::getInt64Ty(F.getContext());
1958     Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty);
1959     Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty);
1960     Function *FI = Intrinsic::getDeclaration(F.getParent(),
1961                                              Intrinsic::hexagon_S2_valignrb);
1962     Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt});
1963     return Builder.CreateBitCast(Call, Lo->getType());
1964   }
1965   llvm_unreachable("Unexpected vector length");
1966 }
1967 
1968 // Concatenates a sequence of vectors of the same type.
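     // E.g. concat(A, B, C) first pads the worklist with an undef to get
     // {A, B, C, U}, joins pairs into {A:B, C:U}, then into {A:B:C:U}, and
     // finally shuffles out just the elements of A, B, and C.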
1969 auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
1970                                   ArrayRef<Value *> Vecs) const -> Value * {
1971   assert(!Vecs.empty());
1972   SmallVector<int, 256> SMask;
1973   std::vector<Value *> Work[2];
1974   int ThisW = 0, OtherW = 1;
1975 
1976   Work[ThisW].assign(Vecs.begin(), Vecs.end());
1977   while (Work[ThisW].size() > 1) {
1978     auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
1979     SMask.resize(length(Ty) * 2);
1980     std::iota(SMask.begin(), SMask.end(), 0);
1981 
1982     Work[OtherW].clear();
1983     if (Work[ThisW].size() % 2 != 0)
1984       Work[ThisW].push_back(UndefValue::get(Ty));
1985     for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
1986       Value *Joined = Builder.CreateShuffleVector(Work[ThisW][i],
1987                                                   Work[ThisW][i + 1], SMask);
1988       Work[OtherW].push_back(Joined);
1989     }
1990     std::swap(ThisW, OtherW);
1991   }
1992 
1993   // Since some undefs may have been appended to make the number of shuffle
1994   // operands even, perform a final shuffle that picks only the original
1995   // elements.
1996   SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
1997   std::iota(SMask.begin(), SMask.end(), 0);
1998   Value *Total = Work[ThisW].front();
1999   return Builder.CreateShuffleVector(Total, SMask);
2000 }
2001 
2002 auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
2003                                    int NewSize, Value *Pad) const -> Value * {
2004   assert(isa<VectorType>(Val->getType()));
2005   auto *ValTy = cast<VectorType>(Val->getType());
2006   assert(ValTy->getElementType() == Pad->getType());
2007 
2008   int CurSize = length(ValTy);
2009   if (CurSize == NewSize)
2010     return Val;
2011   // Truncate?
2012   if (CurSize > NewSize)
2013     return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
2014   // Extend.
2015   SmallVector<int, 128> SMask(NewSize);
2016   std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
2017   std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
2018   Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad);
2019   return Builder.CreateShuffleVector(Val, PadVec, SMask);
2020 }
2021 
2022 auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
2023                                    Type *FromTy, Type *ToTy) const -> Value * {
2024   // Mask is a vector <N x i1>, where each element corresponds to an
2025   // element of FromTy. Remap it so that each element will correspond
2026   // to an element of ToTy.
2027   assert(isa<VectorType>(Mask->getType()));
2028 
2029   Type *FromSTy = FromTy->getScalarType();
2030   Type *ToSTy = ToTy->getScalarType();
2031   if (FromSTy == ToSTy)
2032     return Mask;
2033 
2034   int FromSize = getSizeOf(FromSTy);
2035   int ToSize = getSizeOf(ToSTy);
2036   assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
2037 
2038   auto *MaskTy = cast<VectorType>(Mask->getType());
2039   int FromCount = length(MaskTy);
2040   int ToCount = (FromCount * FromSize) / ToSize;
2041   assert((FromCount * FromSize) % ToSize == 0);
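       // E.g. remapping a mask for <32 x i32> to bytes: FromSize == 4,
       // ToSize == 1, so a <32 x i1> mask becomes a <128 x i1> mask.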
2042 
2043   auto *FromITy = getIntTy(FromSize * 8);
2044   auto *ToITy = getIntTy(ToSize * 8);
2045 
2046   // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
2047   // -> trunc to <M x i1>.
2048   Value *Ext = Builder.CreateSExt(
2049       Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false));
2050   Value *Cast = Builder.CreateBitCast(
2051       Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false));
2052   return Builder.CreateTrunc(
2053       Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false));
2054 }
2055 
2056 // Bitcast to bytes, and return least significant bits.
2057 auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
2058     -> Value * {
2059   Type *ScalarTy = Val->getType()->getScalarType();
2060   if (ScalarTy == getBoolTy())
2061     return Val;
2062 
2063   Value *Bytes = vbytes(Builder, Val);
2064   if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
2065     return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)));
2066   // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
2067   // <1 x i1>.
2068   return Builder.CreateTrunc(Bytes, getBoolTy());
2069 }
2070 
2071 // Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
2072 auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
2073     -> Value * {
2074   Type *ScalarTy = Val->getType()->getScalarType();
2075   if (ScalarTy == getByteTy())
2076     return Val;
2077 
2078   if (ScalarTy != getBoolTy())
2079     return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)));
2080   // For bool, return a sext from i1 to i8.
2081   if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
2082     return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy));
2083   return Builder.CreateSExt(Val, getByteTy());
2084 }
2085 
2086 auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
2087                                      unsigned Start, unsigned Length) const
2088     -> Value * {
2089   assert(Start + Length <= length(Val));
2090   return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
2091 }
2092 
2093 auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
2094     -> Value * {
2095   size_t Len = length(Val);
2096   assert(Len % 2 == 0 && "Length should be even");
2097   return subvector(Builder, Val, 0, Len / 2);
2098 }
2099 
2100 auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
2101     -> Value * {
2102   size_t Len = length(Val);
2103   assert(Len % 2 == 0 && "Length should be even");
2104   return subvector(Builder, Val, Len / 2, Len / 2);
2105 }
2106 
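     // vdeal: deal out even- and odd-indexed elements, e.g. for two
     // 4-element vectors A and B: (a0 a2 b0 b2 | a1 a3 b1 b3) -- the evens
     // of A:B first, then the odds.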
2107 auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
2108                                  Value *Val1) const -> Value * {
2109   assert(Val0->getType() == Val1->getType());
2110   int Len = length(Val0);
2111   SmallVector<int, 128> Mask(2 * Len);
2112 
2113   for (int i = 0; i != Len; ++i) {
2114     Mask[i] = 2 * i;           // Even
2115     Mask[i + Len] = 2 * i + 1; // Odd
2116   }
2117   return Builder.CreateShuffleVector(Val0, Val1, Mask);
2118 }
2119 
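     // vshuff: interleave the two inputs, e.g. for two 4-element vectors
     // A and B: (a0 b0 a1 b1 a2 b2 a3 b3).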
2120 auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
2121                                   Value *Val1) const -> Value * {
2122   assert(Val0->getType() == Val1->getType());
2123   int Len = length(Val0);
2124   SmallVector<int, 128> Mask(2 * Len);
2125 
2126   for (int i = 0; i != Len; ++i) {
2127     Mask[2 * i + 0] = i;       // Val0
2128     Mask[2 * i + 1] = i + Len; // Val1
2129   }
2130   return Builder.CreateShuffleVector(Val0, Val1, Mask);
2131 }
2132 
2133 auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
2134                                               Intrinsic::ID IntID, Type *RetTy,
2135                                               ArrayRef<Value *> Args,
2136                                               ArrayRef<Type *> ArgTys) const
2137     -> Value * {
2138   auto getCast = [&](IRBuilderBase &Builder, Value *Val,
2139                      Type *DestTy) -> Value * {
2140     Type *SrcTy = Val->getType();
2141     if (SrcTy == DestTy)
2142       return Val;
2143 
2144     // A mismatched non-HVX type would have to be a scalar that already
2145     // has a valid type, so at this point SrcTy must be an HVX vector.
2146     assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
2147 
2148     Type *BoolTy = Type::getInt1Ty(F.getContext());
2149     if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
2150       return Builder.CreateBitCast(Val, DestTy);
2151 
2152     // Predicate HVX vector.
2153     unsigned HwLen = HST.getVectorLength();
2154     Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
2155                                    : Intrinsic::hexagon_V6_pred_typecast_128B;
2156     Function *FI =
2157         Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()});
2158     return Builder.CreateCall(FI, {Val});
2159   };
2160 
2161   Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys);
2162   FunctionType *IntrTy = IntrFn->getFunctionType();
2163 
2164   SmallVector<Value *, 4> IntrArgs;
2165   for (int i = 0, e = Args.size(); i != e; ++i) {
2166     Value *A = Args[i];
2167     Type *T = IntrTy->getParamType(i);
2168     if (A->getType() != T) {
2169       IntrArgs.push_back(getCast(Builder, A, T));
2170     } else {
2171       IntrArgs.push_back(A);
2172     }
2173   }
2174   Value *Call = Builder.CreateCall(IntrFn, IntrArgs);
2175 
2176   Type *CallTy = Call->getType();
2177   if (RetTy == nullptr || CallTy == RetTy)
2178     return Call;
2179   // Scalar types should have RetTy matching the call return type.
2180   assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
2181   return getCast(Builder, Call, RetTy);
2182 }
2183 
2184 auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
2185                                                Value *Vec,
2186                                                unsigned ToWidth) const
2187     -> SmallVector<Value *> {
2188   // Break a vector of wide elements into a series of vectors with narrow
2189   // elements:
2190   //   (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
2191   // -->
2192   //   (a0, a1, a2, ...)    // lowest "ToWidth" bits
2193   //   (b0, b1, b2, ...)    // the next lowest...
2194   //   (c0, c1, c2, ...)    // ...
2195   //   ...
2196   //
2197   // The number of elements in each resulting vector is the same as
2198   // in the original vector.
2199 
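       // E.g. splitting <32 x i64> to ToWidth == 32 produces two <32 x i32>
       // vectors: the low words of each element, then the high words.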
2200   auto *VecTy = cast<VectorType>(Vec->getType());
2201   assert(VecTy->getElementType()->isIntegerTy());
2202   unsigned FromWidth = VecTy->getScalarSizeInBits();
2203   assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
2204   assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
2205   unsigned NumResults = FromWidth / ToWidth;
2206 
2207   SmallVector<Value *> Results(NumResults);
2208   Results[0] = Vec;
2209   unsigned Length = length(VecTy);
2210 
2211   // Do it by splitting in half, since those operations correspond to deal
2212   // instructions.
2213   auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
2214     // Take V = Results[Begin], split it into L, H.
2215     // Store Results[Begin] = L, Results[(Begin+End)/2] = H.
2216     // Recurse with split(Begin, Half) and split(Half, End).
2217     if (Begin + 1 == End)
2218       return;
2219 
2220     Value *Val = Results[Begin];
2221     unsigned Width = Val->getType()->getScalarSizeInBits();
2222 
2223     auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
2224     Value *VVal = Builder.CreateBitCast(Val, VTy);
2225 
2226     Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
2227 
2228     unsigned Half = (Begin + End) / 2;
2229     Results[Begin] = sublo(Builder, Res);
2230     Results[Half] = subhi(Builder, Res);
2231 
2232     splitFunc(Begin, Half, splitFunc);
2233     splitFunc(Half, End, splitFunc);
2234   };
2235 
2236   splitInHalf(0, NumResults, splitInHalf);
2237   return Results;
2238 }
2239 
2240 auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
2241                                               ArrayRef<Value *> Values,
2242                                               VectorType *ToType) const
2243     -> Value * {
2244   assert(ToType->getElementType()->isIntegerTy());
2245 
2246   // If the list of values does not have a power-of-2 number of elements,
2247   // append copies of the sign bit to it, to make the size be 2^n.
2248   // The reason for this is that the values will be joined in pairs, because
2249   // otherwise the shuffles will result in convoluted code. With pairwise
2250   // joins, the shuffles will hopefully be folded into a perfect shuffle.
2251   // The output will need to be sign-extended to a type with element width
2252   // being a power of 2 anyway.
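     // E.g. joining three i32 words into an i128 element type requires four
     // inputs; the missing fourth word is filled with the sign of the last.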
2253   SmallVector<Value *> Inputs(Values.begin(), Values.end());
2254 
2255   unsigned ToWidth = ToType->getScalarSizeInBits();
2256   unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
2257   assert(Width <= ToWidth);
2258   assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
2259   unsigned Length = length(Inputs.front()->getType());
2260 
2261   unsigned NeedInputs = ToWidth / Width;
2262   if (Inputs.size() != NeedInputs) {
2263     Value *Last = Inputs.back();
2264     Value *Sign =
2265         Builder.CreateAShr(Last, getConstSplat(Last->getType(), Width - 1));
2266     Inputs.resize(NeedInputs, Sign);
2267   }
2268 
2269   while (Inputs.size() > 1) {
2270     Width *= 2;
2271     auto *VTy = VectorType::get(getIntTy(Width), Length, false);
2272     for (int i = 0, e = Inputs.size(); i < e; i += 2) {
2273       Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
2274       Inputs[i / 2] = Builder.CreateBitCast(Res, VTy);
2275     }
2276     Inputs.resize(Inputs.size() / 2);
2277   }
2278 
2279   assert(Inputs.front()->getType() == ToType);
2280   return Inputs.front();
2281 }
2282 
2283 auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
2284                                                       Value *Ptr1) const
2285     -> std::optional<int> {
2286   struct Builder : IRBuilder<> {
2287     Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
2288     ~Builder() {
2289       for (Instruction *I : llvm::reverse(ToErase))
2290         I->eraseFromParent();
2291     }
2292     SmallVector<Instruction *, 8> ToErase;
2293   };
2294 
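     // CallBuilder(B, Call) forwards Call to the IRBuilder B and records any
     // instruction it creates in B.ToErase, so that all temporary instructions
     // are erased when B goes out of scope (see ~Builder above).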
2295 #define CallBuilder(B, F)                                                      \
2296   [&](auto &B_) {                                                              \
2297     Value *V = B_.F;                                                           \
2298     if (auto *I = dyn_cast<Instruction>(V))                                    \
2299       B_.ToErase.push_back(I);                                                 \
2300     return V;                                                                  \
2301   }(B)
2302 
2303   auto Simplify = [this](Value *V) {
2304     if (Value *S = simplify(V))
2305       return S;
2306     return V;
2307   };
2308 
2309   auto StripBitCast = [](Value *V) {
2310     while (auto *C = dyn_cast<BitCastInst>(V))
2311       V = C->getOperand(0);
2312     return V;
2313   };
2314 
2315   Ptr0 = StripBitCast(Ptr0);
2316   Ptr1 = StripBitCast(Ptr1);
2317   if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
2318     return std::nullopt;
2319 
2320   auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
2321   auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
2322   if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
2323     return std::nullopt;
2324 
2325   Builder B(Gep0->getParent());
2326   int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
2327 
2328   // FIXME: for now only check GEPs with a single index.
2329   if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
2330     return std::nullopt;
2331 
2332   Value *Idx0 = Gep0->getOperand(1);
2333   Value *Idx1 = Gep1->getOperand(1);
2334 
2335   // First, try to simplify the subtraction directly.
2336   if (auto *Diff = dyn_cast<ConstantInt>(
2337           Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
2338     return Diff->getSExtValue() * Scale;
2339 
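       // Otherwise, split each index into its unknown-bit and known-bit
       // parts. The masks are disjoint, so
       //   Idx == (Idx & Unknown) + (Idx & ~Unknown),
       // and the index difference is the sum of the differences of the
       // parts, provided both partial differences fold to constants.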
2340   KnownBits Known0 = getKnownBits(Idx0, Gep0);
2341   KnownBits Known1 = getKnownBits(Idx1, Gep1);
2342   APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
2343   if (Unknown.isAllOnes())
2344     return std::nullopt;
2345 
2346   Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
2347   Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
2348   Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
2349   Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
2350   int Diff0 = 0;
2351   if (auto *C = dyn_cast<ConstantInt>(SubU)) {
2352     Diff0 = C->getSExtValue();
2353   } else {
2354     return std::nullopt;
2355   }
2356 
2357   Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
2358   Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
2359   Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
2360   Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
2361   int Diff1 = 0;
2362   if (auto *C = dyn_cast<ConstantInt>(SubK)) {
2363     Diff1 = C->getSExtValue();
2364   } else {
2365     return std::nullopt;
2366   }
2367 
2368   return (Diff0 + Diff1) * Scale;
2369 
2370 #undef CallBuilder
2371 }
2372 
2373 auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
2374                                                  const Instruction *CtxI) const
2375     -> unsigned {
2376   return ComputeMaxSignificantBits(V, DL, /*Depth=*/0, &AC, CtxI, &DT);
2377 }
2378 
2379 auto HexagonVectorCombine::getKnownBits(const Value *V,
2380                                         const Instruction *CtxI) const
2381     -> KnownBits {
2382   return computeKnownBits(V, DL, /*Depth=*/0, &AC, CtxI, &DT, /*ORE=*/nullptr,
2383                           /*UseInstrInfo=*/true);
2384 }
2385 
2386 template <typename T>
2387 auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
2388                                                   BasicBlock::const_iterator To,
2389                                                   const T &IgnoreInsts) const
2390     -> bool {
2391   auto getLocOrNone = [this](const Instruction &I) -> Optional<MemoryLocation> {
2392     if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
2393       switch (II->getIntrinsicID()) {
2394       case Intrinsic::masked_load:
2395         return MemoryLocation::getForArgument(II, 0, TLI);
2396       case Intrinsic::masked_store:
2397         return MemoryLocation::getForArgument(II, 1, TLI);
2398       }
2399     }
2400     return MemoryLocation::getOrNone(&I);
2401   };
2402 
2403   // The source and the destination must be in the same basic block.
2404   const BasicBlock &Block = *In.getParent();
2405   assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
2406   // No PHIs.
2407   if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
2408     return false;
2409 
2410   if (!mayHaveNonDefUseDependency(In))
2411     return true;
2412   bool MayWrite = In.mayWriteToMemory();
2413   auto MaybeLoc = getLocOrNone(In);
2414 
2415   auto From = In.getIterator();
2416   if (From == To)
2417     return true;
2418   bool MoveUp = (To != Block.end() && To->comesBefore(&In));
2419   auto Range =
2420       MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
2421   for (auto It = Range.first; It != Range.second; ++It) {
2422     const Instruction &I = *It;
2423     if (llvm::is_contained(IgnoreInsts, &I))
2424       continue;
2425     // The assume intrinsic can be ignored.
2426     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
2427       if (II->getIntrinsicID() == Intrinsic::assume)
2428         continue;
2429     }
2430     // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
2431     if (I.mayThrow())
2432       return false;
2433     if (auto *CB = dyn_cast<CallBase>(&I)) {
2434       if (!CB->hasFnAttr(Attribute::WillReturn))
2435         return false;
2436       if (!CB->hasFnAttr(Attribute::NoSync))
2437         return false;
2438     }
2439     if (I.mayReadOrWriteMemory()) {
2440       auto MaybeLocI = getLocOrNone(I);
2441       if (MayWrite || I.mayWriteToMemory()) {
2442         if (!MaybeLoc || !MaybeLocI)
2443           return false;
2444         if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
2445           return false;
2446       }
2447     }
2448   }
2449   return true;
2450 }
2451 
2452 auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
2453   if (auto *VecTy = dyn_cast<VectorType>(Ty))
2454     return VecTy->getElementType() == getByteTy();
2455   return false;
2456 }
2457 
2458 auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
2459                                            Value *Hi, int Start,
2460                                            int Length) const -> Value * {
2461   assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
2462   SmallVector<int, 128> SMask(Length);
2463   std::iota(SMask.begin(), SMask.end(), Start);
2464   return Builder.CreateShuffleVector(Lo, Hi, SMask);
2465 }
2466 
2467 // Pass management.
2468 
2469 namespace llvm {
2470 void initializeHexagonVectorCombineLegacyPass(PassRegistry &);
2471 FunctionPass *createHexagonVectorCombineLegacyPass();
2472 } // namespace llvm
2473 
2474 namespace {
2475 class HexagonVectorCombineLegacy : public FunctionPass {
2476 public:
2477   static char ID;
2478 
2479   HexagonVectorCombineLegacy() : FunctionPass(ID) {}
2480 
2481   StringRef getPassName() const override { return "Hexagon Vector Combine"; }
2482 
2483   void getAnalysisUsage(AnalysisUsage &AU) const override {
2484     AU.setPreservesCFG();
2485     AU.addRequired<AAResultsWrapperPass>();
2486     AU.addRequired<AssumptionCacheTracker>();
2487     AU.addRequired<DominatorTreeWrapperPass>();
2488     AU.addRequired<TargetLibraryInfoWrapperPass>();
2489     AU.addRequired<TargetPassConfig>();
2490     FunctionPass::getAnalysisUsage(AU);
2491   }
2492 
2493   bool runOnFunction(Function &F) override {
2494     if (skipFunction(F))
2495       return false;
2496     AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
2497     AssumptionCache &AC =
2498         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2499     DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2500     TargetLibraryInfo &TLI =
2501         getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2502     auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
2503     HexagonVectorCombine HVC(F, AA, AC, DT, TLI, TM);
2504     return HVC.run();
2505   }
2506 };
2507 } // namespace
2508 
2509 char HexagonVectorCombineLegacy::ID = 0;
2510 
2511 INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
2512                       "Hexagon Vector Combine", false, false)
2513 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
2514 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2515 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2516 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2517 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
2518 INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
2519                     "Hexagon Vector Combine", false, false)
2520 
2521 FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
2522   return new HexagonVectorCombineLegacy();
2523 }
2524