Dialect/X86Vector/Transforms.h

8508a63bSEmilio Cota//=- Transforms.h - X86Vector Dialect Transformation Entrypoints -*- C++ -*-=//
8508a63bSEmilio Cota//
8508a63bSEmilio Cota// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8508a63bSEmilio Cota// See https://llvm.org/LICENSE.txt for license information.
8508a63bSEmilio Cota// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8508a63bSEmilio Cota//
8508a63bSEmilio Cota//===----------------------------------------------------------------------===//
8508a63bSEmilio Cota
8508a63bSEmilio Cota#ifndef MLIR_DIALECT_X86VECTOR_TRANSFORMS_H
8508a63bSEmilio Cota#define MLIR_DIALECT_X86VECTOR_TRANSFORMS_H
8508a63bSEmilio Cota
34ff8573SNicolas Vasilache#include "mlir/IR/Value.h"
34ff8573SNicolas Vasilache
8508a63bSEmilio Cotanamespace mlir {
8508a63bSEmilio Cota
34ff8573SNicolas Vasilacheclass ImplicitLocOpBuilder;
8508a63bSEmilio Cotaclass LLVMConversionTarget;
8508a63bSEmilio Cotaclass LLVMTypeConverter;
8508a63bSEmilio Cotaclass RewritePatternSet;
8508a63bSEmilio Cota
34ff8573SNicolas Vasilachenamespace x86vector {
34ff8573SNicolas Vasilache
/// Helper class to factor out the creation and extraction of the 8-bit
/// immediate masks used by blend, shuffle and permute style instructions.
///
/// Every `extract*` function is the inverse of the matching builder: each field
/// it writes is shifted down to bit 0, so passing the extracted fields back to
/// the builder recreates `mask`.
///
/// Beware of the argument order: the `shuffle` and `permute` builders take
/// their template arguments highest field first, whereas all `extract*`
/// functions write their outputs lowest field first.
struct MaskHelper {
  /// Builds a blend mask out of 8 single-bit fields.
  /// b0 captures the lowest bit, b7 captures the highest bit.
  /// Meant to be used with instructions such as mm256BlendPs.
  template <uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3, uint8_t b4,
            uint8_t b5, uint8_t b6, uint8_t b7>
  static constexpr uint8_t blend() {
    static_assert(b0 <= 1 && b1 <= 1 && b2 <= 1 && b3 <= 1, "overflow");
    static_assert(b4 <= 1 && b5 <= 1 && b6 <= 1 && b7 <= 1, "overflow");
    return static_cast<uint8_t>((b7 << 7) | (b6 << 6) | (b5 << 5) | (b4 << 4) |
                                (b3 << 3) | (b2 << 2) | (b1 << 1) | b0);
  }
  /// Splits a blend mask into its 8 single-bit fields; each output is 0 or 1.
  /// b0 captures the lowest bit, b7 captures the highest bit.
  /// Meant to be used with instructions such as mm256BlendPs.
  static void extractBlend(uint8_t mask, uint8_t &b0, uint8_t &b1, uint8_t &b2,
                           uint8_t &b3, uint8_t &b4, uint8_t &b5, uint8_t &b6,
                           uint8_t &b7) {
    // Shift before masking: `mask & (1 << k)` would yield 0 or 2^k, which is
    // not a valid argument for `blend` and does not match the other extractors.
    b7 = (mask >> 7) & 1;
    b6 = (mask >> 6) & 1;
    b5 = (mask >> 5) & 1;
    b4 = (mask >> 4) & 1;
    b3 = (mask >> 3) & 1;
    b2 = (mask >> 2) & 1;
    b1 = (mask >> 1) & 1;
    b0 = mask & 1;
  }
  /// Builds a shuffle mask out of four 2-bit fields.
  /// b01 captures the lower 2 bits, b67 captures the higher 2 bits.
  /// Meant to be used with instructions such as mm256ShufflePs.
  template <unsigned b67, unsigned b45, unsigned b23, unsigned b01>
  static constexpr uint8_t shuffle() {
    static_assert(b01 <= 0x03, "overflow");
    static_assert(b23 <= 0x03, "overflow");
    static_assert(b45 <= 0x03, "overflow");
    static_assert(b67 <= 0x03, "overflow");
    return static_cast<uint8_t>((b67 << 6) | (b45 << 4) | (b23 << 2) | b01);
  }
  /// Splits a shuffle mask into its four 2-bit fields; each output is in
  /// [0, 3].
  /// b01 captures the lower 2 bits, b67 captures the higher 2 bits.
  static void extractShuffle(uint8_t mask, uint8_t &b01, uint8_t &b23,
                             uint8_t &b45, uint8_t &b67) {
    b67 = (mask >> 6) & 0x03;
    b45 = (mask >> 4) & 0x03;
    b23 = (mask >> 2) & 0x03;
    b01 = mask & 0x03;
  }
  /// Builds a permute mask out of two 4-bit fields.
  /// b03 captures the lower 4 bits, b47 captures the higher 4 bits.
  /// Meant to be used with instructions such as mm256Permute2f128Ps.
  template <unsigned b47, unsigned b03>
  static constexpr uint8_t permute() {
    static_assert(b03 <= 0x0f, "overflow");
    static_assert(b47 <= 0x0f, "overflow");
    // The fields cannot overlap (see the asserts), so `|` equals `+` here.
    return static_cast<uint8_t>((b47 << 4) | b03);
  }
  /// Splits a permute mask into its two 4-bit fields; each output is in
  /// [0, 15].
  /// b03 captures the lower 4 bits, b47 captures the higher 4 bits.
  static void extractPermute(uint8_t mask, uint8_t &b03, uint8_t &b47) {
    b47 = (mask >> 4) & 0x0f;
    b03 = mask & 0x0f;
  }
};
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache//===----------------------------------------------------------------------===//
34ff8573SNicolas Vasilache/// Helpers extracted from:
34ff8573SNicolas Vasilache///   - clang/lib/Headers/avxintrin.h
34ff8573SNicolas Vasilache///   - clang/test/CodeGen/X86/avx-builtins.c
34ff8573SNicolas Vasilache///   - clang/test/CodeGen/X86/avx2-builtins.c
34ff8573SNicolas Vasilache///   - clang/test/CodeGen/X86/avx-shuffle-builtins.c
34ff8573SNicolas Vasilache/// as well as the Intel Intrinsics Guide
34ff8573SNicolas Vasilache/// (https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html)
34ff8573SNicolas Vasilache/// make it easier to just implement known good lowerings.
34ff8573SNicolas Vasilache/// All intrinsics correspond 1-1 to the Intel definition.
34ff8573SNicolas Vasilache//===----------------------------------------------------------------------===//
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilachenamespace avx2 {
34ff8573SNicolas Vasilache
b2729fdaSNicolas Vasilachenamespace inline_asm {
b2729fdaSNicolas Vasilache//===----------------------------------------------------------------------===//
/// Methods in the inline_asm namespace emit calls to LLVM::InlineAsmOp.
b2729fdaSNicolas Vasilache//===----------------------------------------------------------------------===//
b2729fdaSNicolas Vasilache/// If bit i of `mask` is zero, take f32@i from v1 else take it from v2.
b2729fdaSNicolas VasilacheValue mm256BlendPsAsm(ImplicitLocOpBuilder &b, Value v1, Value v2,
b2729fdaSNicolas Vasilache                      uint8_t mask);
b2729fdaSNicolas Vasilache
b2729fdaSNicolas Vasilache} // namespace inline_asm
b2729fdaSNicolas Vasilache
b2729fdaSNicolas Vasilachenamespace intrin {
b2729fdaSNicolas Vasilache//===----------------------------------------------------------------------===//
b2729fdaSNicolas Vasilache/// Methods in the intrin namespace emulate clang's impl. of X86 intrinsics.
b2729fdaSNicolas Vasilache//===----------------------------------------------------------------------===//
34ff8573SNicolas Vasilache/// Lower to vector.shuffle v1, v2, [0, 8, 1, 9, 4, 12, 5, 13].
34ff8573SNicolas VasilacheValue mm256UnpackLoPs(ImplicitLocOpBuilder &b, Value v1, Value v2);
34ff8573SNicolas Vasilache
/// Lower to vector.shuffle v1, v2, [2, 10, 3, 11, 6, 14, 7, 15].
34ff8573SNicolas VasilacheValue mm256UnpackHiPs(ImplicitLocOpBuilder &b, Value v1, Value v2);
34ff8573SNicolas Vasilache
/// Take an 8-bit mask made of four 2-bit selectors b01, b23, b45, b67 (lowest
/// bits first). Within each 128-bit lane, b01 and b23 each select one of the
/// 4 f32 of a, and b45 and b67 each select one of the 4 f32 of b (a = v1,
/// b = v2):
///               0:127              |              128:255
///   a[b01] a[b23] b[b45] b[b67]    |  a[b01+4] a[b23+4] b[b45+4] b[b67+4]
b2729fdaSNicolas VasilacheValue mm256ShufflePs(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask);
34ff8573SNicolas Vasilache
/// Each 128-bit half of the result is chosen by the low 2 bits of one nibble
/// of the immediate (a = v1, b = v2):
///   imm[0:1] (out of imm[0:3]) selects result[0:127],
///   imm[4:5] (out of imm[4:7]) selects result[128:255],
/// where the 2-bit selector value means:
///      0             1             2             3
///   a[0:127] or a[128:255] or b[0:127] or b[128:255]
34ff8573SNicolas VasilacheValue mm256Permute2f128Ps(ImplicitLocOpBuilder &b, Value v1, Value v2,
b2729fdaSNicolas Vasilache                          uint8_t mask);
34ff8573SNicolas Vasilache
b2729fdaSNicolas Vasilache/// If bit i of `mask` is zero, take f32@i from v1 else take it from v2.
b2729fdaSNicolas VasilacheValue mm256BlendPs(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask);
b2729fdaSNicolas Vasilache} // namespace intrin
b2729fdaSNicolas Vasilache
b2729fdaSNicolas Vasilache//===----------------------------------------------------------------------===//
b2729fdaSNicolas Vasilache/// Generic lowerings may either use intrin or inline_asm depending on needs.
b2729fdaSNicolas Vasilache//===----------------------------------------------------------------------===//
34ff8573SNicolas Vasilache/// 4x8xf32-specific AVX2 transpose lowering.
34ff8573SNicolas Vasilachevoid transpose4x8xf32(ImplicitLocOpBuilder &ib, MutableArrayRef<Value> vs);
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache/// 8x8xf32-specific AVX2 transpose lowering.
34ff8573SNicolas Vasilachevoid transpose8x8xf32(ImplicitLocOpBuilder &ib, MutableArrayRef<Value> vs);
34ff8573SNicolas Vasilache
/// Switches selecting which shape-specialized AVX2 transpose lowerings are
/// requested. Both switches are off by default; every setter returns `*this`
/// so that calls can be chained.
struct TransposeLoweringOptions {
  /// Switch for the 4x8xf32 specialization.
  bool lower4x8xf32_ = false;
  /// Switch for the 8x8xf32 specialization.
  bool lower8x8xf32_ = false;

  /// Sets the 4x8xf32 switch to `lower` (enables it when called without
  /// argument).
  TransposeLoweringOptions &lower4x8xf32(bool lower = true) {
    this->lower4x8xf32_ = lower;
    return *this;
  }

  /// Sets the 8x8xf32 switch to `lower` (enables it when called without
  /// argument).
  TransposeLoweringOptions &lower8x8xf32(bool lower = true) {
    this->lower8x8xf32_ = lower;
    return *this;
  }
};
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache/// Options for controlling specialized AVX2 lowerings.
34ff8573SNicolas Vasilachestruct LoweringOptions {
34ff8573SNicolas Vasilache  /// Configure specialized vector lowerings.
34ff8573SNicolas Vasilache  TransposeLoweringOptions transposeOptions;
34ff8573SNicolas Vasilache  LoweringOptions &setTransposeOptions(TransposeLoweringOptions options) {
34ff8573SNicolas Vasilache    transposeOptions = options;
34ff8573SNicolas Vasilache    return *this;
34ff8573SNicolas Vasilache  }
34ff8573SNicolas Vasilache};
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache/// Insert specialized transpose lowering patterns.
34ff8573SNicolas Vasilachevoid populateSpecializedTransposeLoweringPatterns(
34ff8573SNicolas Vasilache    RewritePatternSet &patterns, LoweringOptions options = LoweringOptions(),
34ff8573SNicolas Vasilache    int benefit = 10);
34ff8573SNicolas Vasilache
34ff8573SNicolas Vasilache} // namespace avx2
34ff8573SNicolas Vasilache} // namespace x86vector
34ff8573SNicolas Vasilache
8508a63bSEmilio Cota/// Collect a set of patterns to lower X86Vector ops to ops that map to LLVM
8508a63bSEmilio Cota/// intrinsics.
8508a63bSEmilio Cotavoid populateX86VectorLegalizeForLLVMExportPatterns(
*206fad0eSMatthias Springer    const LLVMTypeConverter &converter, RewritePatternSet &patterns);
8508a63bSEmilio Cota
8508a63bSEmilio Cota/// Configure the target to support lowering X86Vector ops to ops that map to
8508a63bSEmilio Cota/// LLVM intrinsics.
8508a63bSEmilio Cotavoid configureX86VectorLegalizeForExportTarget(LLVMConversionTarget &target);
8508a63bSEmilio Cota
8508a63bSEmilio Cota} // namespace mlir
8508a63bSEmilio Cota
8508a63bSEmilio Cota#endif // MLIR_DIALECT_X86VECTOR_TRANSFORMS_H