xref: /llvm-project/flang/runtime/matmul-transpose.cpp (revision 8ce1aed55f3dbb71406dc6feaed3f162ac183d21)
1 //===-- runtime/matmul-transpose.cpp --------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // Implements a fused matmul-transpose operation
10 //
11 // There are two main entry points; one establishes a descriptor for the
12 // result and allocates it, and the other expects a result descriptor that
13 // points to existing storage.
14 //
15 // This implementation must handle all combinations of numeric types and
16 // kinds (100 - 165 cases depending on the target), plus all combinations
17 // of logical kinds (16).  A single template undergoes many instantiations
18 // to cover all of the valid possibilities.
19 //
20 // The usefulness of this optimization should be reviewed once Matmul is swapped
21 // to use the faster BLAS routines.
22 
23 #include "flang/Runtime/matmul-transpose.h"
24 #include "terminator.h"
25 #include "tools.h"
26 #include "flang/Common/optional.h"
27 #include "flang/Runtime/c-or-cpp.h"
28 #include "flang/Runtime/cpp-type.h"
29 #include "flang/Runtime/descriptor.h"
30 #include <cstring>
31 
32 namespace {
33 using namespace Fortran::runtime;
34 
35 // Suppress the warnings about calling __host__-only std::complex operators,
36 // defined in C++ STD header files, from __device__ code.
37 RT_DIAG_PUSH
38 RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
39 
40 // Contiguous numeric TRANSPOSE(matrix)*matrix multiplication
41 //   TRANSPOSE(matrix(n, rows)) * matrix(n,cols) ->
42 //             matrix(rows, n)  * matrix(n,cols) -> matrix(rows,cols)
43 // The transpose is implemented by swapping the indices of accesses into the LHS
44 //
45 // Straightforward algorithm:
46 //   DO 1 I = 1, NROWS
47 //    DO 1 J = 1, NCOLS
48 //     RES(I,J) = 0
49 //     DO 1 K = 1, N
50 //   1  RES(I,J) = RES(I,J) + X(K,I)*Y(K,J)
51 //
52 // With loop distribution and transposition to avoid the inner sum
53 // reduction and to avoid non-unit strides:
54 //   DO 1 I = 1, NROWS
55 //    DO 1 J = 1, NCOLS
56 //   1 RES(I,J) = 0
57 //   DO 2 J = 1, NCOLS
58 //    DO 2 I = 1, NROWS
59 //     DO 2 K = 1, N
60 //   2  RES(I,J) = RES(I,J) + X(K,I)*Y(K,J) ! loop-invariant last term
61 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
62     bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
63 inline static RT_API_ATTRS void MatrixTransposedTimesMatrix(
64     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
65     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
66     SubscriptValue n, std::size_t xColumnByteStride = 0,
67     std::size_t yColumnByteStride = 0) {
68   using ResultType = CppTypeFor<RCAT, RKIND>;
69 
70   std::memset(product, 0, rows * cols * sizeof *product);
71   for (SubscriptValue j{0}; j < cols; ++j) {
72     for (SubscriptValue i{0}; i < rows; ++i) {
73       for (SubscriptValue k{0}; k < n; ++k) {
74         ResultType x_ki;
75         if constexpr (!X_HAS_STRIDED_COLUMNS) {
76           x_ki = static_cast<ResultType>(x[i * n + k]);
77         } else {
78           x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
79               reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
80         }
81         ResultType y_kj;
82         if constexpr (!Y_HAS_STRIDED_COLUMNS) {
83           y_kj = static_cast<ResultType>(y[j * n + k]);
84         } else {
85           y_kj = static_cast<ResultType>(reinterpret_cast<const YT *>(
86               reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);
87         }
88         product[j * rows + i] += x_ki * y_kj;
89       }
90     }
91   }
92 }
93 
94 RT_DIAG_POP
95 
96 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
97 inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper(
98     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
99     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
100     SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,
101     Fortran::common::optional<std::size_t> yColumnByteStride) {
102   if (!xColumnByteStride) {
103     if (!yColumnByteStride) {
104       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
105           product, rows, cols, x, y, n);
106     } else {
107       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, true>(
108           product, rows, cols, x, y, n, 0, *yColumnByteStride);
109     }
110   } else {
111     if (!yColumnByteStride) {
112       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, false>(
113           product, rows, cols, x, y, n, *xColumnByteStride);
114     } else {
115       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, true>(
116           product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);
117     }
118   }
119 }
120 
121 RT_DIAG_PUSH
122 RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
123 
124 // Contiguous numeric matrix*vector multiplication
125 //   matrix(rows,n) * column vector(n) -> column vector(rows)
126 // Straightforward algorithm:
127 //   DO 1 I = 1, NROWS
128 //    RES(I) = 0
129 //    DO 1 K = 1, N
130 //   1 RES(I) = RES(I) + X(K,I)*Y(K)
131 // With loop distribution and transposition to avoid the inner
132 // sum reduction and to avoid non-unit strides:
133 //   DO 1 I = 1, NROWS
134 //   1 RES(I) = 0
135 //   DO 2 I = 1, NROWS
136 //    DO 2 K = 1, N
137 //   2 RES(I) = RES(I) + X(K,I)*Y(K)
138 template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
139     bool X_HAS_STRIDED_COLUMNS>
140 inline static RT_API_ATTRS void MatrixTransposedTimesVector(
141     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
142     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
143     std::size_t xColumnByteStride = 0) {
144   using ResultType = CppTypeFor<RCAT, RKIND>;
145   std::memset(product, 0, rows * sizeof *product);
146   for (SubscriptValue i{0}; i < rows; ++i) {
147     for (SubscriptValue k{0}; k < n; ++k) {
148       ResultType x_ki;
149       if constexpr (!X_HAS_STRIDED_COLUMNS) {
150         x_ki = static_cast<ResultType>(x[i * n + k]);
151       } else {
152         x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
153             reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
154       }
155       ResultType y_k = static_cast<ResultType>(y[k]);
156       product[i] += x_ki * y_k;
157     }
158   }
159 }
160 
161 RT_DIAG_POP
162 
163 template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
164 inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper(
165     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
166     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
167     Fortran::common::optional<std::size_t> xColumnByteStride) {
168   if (!xColumnByteStride) {
169     MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, false>(
170         product, rows, n, x, y);
171   } else {
172     MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, true>(
173         product, rows, n, x, y, *xColumnByteStride);
174   }
175 }
176 
177 RT_DIAG_PUSH
178 RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
179 
180 // Implements an instance of MATMUL for given argument types.
181 template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
182     typename YT>
183 inline static RT_API_ATTRS void DoMatmulTranspose(
184     std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
185     const Descriptor &x, const Descriptor &y, Terminator &terminator) {
186   int xRank{x.rank()};
187   int yRank{y.rank()};
188   int resRank{xRank + yRank - 2};
189   if (xRank * yRank != 2 * resRank) {
190     terminator.Crash(
191         "MATMUL-TRANSPOSE: bad argument ranks (%d * %d)", xRank, yRank);
192   }
193   SubscriptValue extent[2]{x.GetDimension(1).Extent(),
194       resRank == 2 ? y.GetDimension(1).Extent() : 0};
195   if constexpr (IS_ALLOCATING) {
196     result.Establish(
197         RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
198     for (int j{0}; j < resRank; ++j) {
199       result.GetDimension(j).SetBounds(1, extent[j]);
200     }
201     if (int stat{result.Allocate()}) {
202       terminator.Crash(
203           "MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
204           stat);
205     }
206   } else {
207     RUNTIME_CHECK(terminator, resRank == result.rank());
208     RUNTIME_CHECK(
209         terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
210     RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
211     RUNTIME_CHECK(terminator,
212         resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
213   }
214   SubscriptValue n{x.GetDimension(0).Extent()};
215   if (n != y.GetDimension(0).Extent()) {
216     terminator.Crash(
217         "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
218         static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
219         static_cast<std::intmax_t>(x.GetDimension(1).Extent()),
220         static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
221         static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
222   }
223   using WriteResult =
224       CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
225           RKIND>;
226   const SubscriptValue rows{extent[0]};
227   const SubscriptValue cols{extent[1]};
228   if constexpr (RCAT != TypeCategory::Logical) {
229     if (x.IsContiguous(1) && y.IsContiguous(1) &&
230         (IS_ALLOCATING || result.IsContiguous())) {
231       // Contiguous numeric matrices (maybe with columns
232       // separated by a stride).
233       Fortran::common::optional<std::size_t> xColumnByteStride;
234       if (!x.IsContiguous()) {
235         // X's columns are strided.
236         SubscriptValue xAt[2]{};
237         x.GetLowerBounds(xAt);
238         xAt[1]++;
239         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
240       }
241       Fortran::common::optional<std::size_t> yColumnByteStride;
242       if (!y.IsContiguous()) {
243         // Y's columns are strided.
244         SubscriptValue yAt[2]{};
245         y.GetLowerBounds(yAt);
246         yAt[1]++;
247         yColumnByteStride = y.SubscriptsToByteOffset(yAt);
248       }
249       if (resRank == 2) { // M*M -> M
250         // TODO: use BLAS-3 GEMM for supported types.
251         MatrixTransposedTimesMatrixHelper<RCAT, RKIND, XT, YT>(
252             result.template OffsetElement<WriteResult>(), rows, cols,
253             x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,
254             yColumnByteStride);
255         return;
256       }
257       if (xRank == 2) { // M*V -> V
258         // TODO: use BLAS-2 GEMM for supported types.
259         MatrixTransposedTimesVectorHelper<RCAT, RKIND, XT, YT>(
260             result.template OffsetElement<WriteResult>(), rows, n,
261             x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);
262         return;
263       }
264       // else V*M -> V (not allowed because TRANSPOSE() is only defined for rank
265       // 1 matrices
266       terminator.Crash(
267           "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
268           static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
269           static_cast<std::intmax_t>(n),
270           static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
271           static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
272       return;
273     }
274   }
275   // General algorithms for LOGICAL and noncontiguity
276   SubscriptValue xLB[2], yLB[2], resLB[2];
277   x.GetLowerBounds(xLB);
278   y.GetLowerBounds(yLB);
279   result.GetLowerBounds(resLB);
280   using ResultType = CppTypeFor<RCAT, RKIND>;
281   if (resRank == 2) { // M*M -> M
282     for (SubscriptValue i{0}; i < rows; ++i) {
283       for (SubscriptValue j{0}; j < cols; ++j) {
284         ResultType res_ij;
285         if constexpr (RCAT == TypeCategory::Logical) {
286           res_ij = false;
287         } else {
288           res_ij = 0;
289         }
290 
291         for (SubscriptValue k{0}; k < n; ++k) {
292           SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
293           SubscriptValue yAt[2]{k + yLB[0], j + yLB[1]};
294           if constexpr (RCAT == TypeCategory::Logical) {
295             ResultType x_ki = IsLogicalElementTrue(x, xAt);
296             ResultType y_kj = IsLogicalElementTrue(y, yAt);
297             res_ij = res_ij || (x_ki && y_kj);
298           } else {
299             ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
300             ResultType y_kj = static_cast<ResultType>(*y.Element<YT>(yAt));
301             res_ij += x_ki * y_kj;
302           }
303         }
304         SubscriptValue resAt[2]{i + resLB[0], j + resLB[1]};
305         *result.template Element<WriteResult>(resAt) = res_ij;
306       }
307     }
308   } else if (xRank == 2) { // M*V -> V
309     for (SubscriptValue i{0}; i < rows; ++i) {
310       ResultType res_i;
311       if constexpr (RCAT == TypeCategory::Logical) {
312         res_i = false;
313       } else {
314         res_i = 0;
315       }
316 
317       for (SubscriptValue k{0}; k < n; ++k) {
318         SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
319         SubscriptValue yAt[1]{k + yLB[0]};
320         if constexpr (RCAT == TypeCategory::Logical) {
321           ResultType x_ki = IsLogicalElementTrue(x, xAt);
322           ResultType y_k = IsLogicalElementTrue(y, yAt);
323           res_i = res_i || (x_ki && y_k);
324         } else {
325           ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
326           ResultType y_k = static_cast<ResultType>(*y.Element<YT>(yAt));
327           res_i += x_ki * y_k;
328         }
329       }
330       SubscriptValue resAt[1]{i + resLB[0]};
331       *result.template Element<WriteResult>(resAt) = res_i;
332     }
333   } else { // V*M -> V
334     // TRANSPOSE(V) not allowed by fortran standard
335     terminator.Crash(
336         "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
337         static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
338         static_cast<std::intmax_t>(n),
339         static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
340         static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
341   }
342 }
343 
344 RT_DIAG_POP
345 
346 template <bool IS_ALLOCATING, TypeCategory XCAT, int XKIND, TypeCategory YCAT,
347     int YKIND>
348 struct MatmulTransposeHelper {
349   using ResultDescriptor =
350       std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
351   RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
352       const Descriptor &y, const char *sourceFile, int line) const {
353     Terminator terminator{sourceFile, line};
354     auto xCatKind{x.type().GetCategoryAndKind()};
355     auto yCatKind{y.type().GetCategoryAndKind()};
356     RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
357     RUNTIME_CHECK(terminator, xCatKind->first == XCAT);
358     RUNTIME_CHECK(terminator, yCatKind->first == YCAT);
359     if constexpr (constexpr auto resultType{
360                       GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
361       return DoMatmulTranspose<IS_ALLOCATING, resultType->first,
362           resultType->second, CppTypeFor<XCAT, XKIND>, CppTypeFor<YCAT, YKIND>>(
363           result, x, y, terminator);
364     }
365     terminator.Crash("MATMUL-TRANSPOSE: bad operand types (%d(%d), %d(%d))",
366         static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
367   }
368 };
369 } // namespace
370 
371 namespace Fortran::runtime {
372 extern "C" {
373 RT_EXT_API_GROUP_BEGIN
374 
375 #define MATMUL_INSTANCE(XCAT, XKIND, YCAT, YKIND) \
376   void RTDEF(MatmulTranspose##XCAT##XKIND##YCAT##YKIND)(Descriptor & result, \
377       const Descriptor &x, const Descriptor &y, const char *sourceFile, \
378       int line) { \
379     MatmulTransposeHelper<true, TypeCategory::XCAT, XKIND, TypeCategory::YCAT, \
380         YKIND>{}(result, x, y, sourceFile, line); \
381   }
382 
383 #define MATMUL_DIRECT_INSTANCE(XCAT, XKIND, YCAT, YKIND) \
384   void RTDEF(MatmulTransposeDirect##XCAT##XKIND##YCAT##YKIND)( \
385       Descriptor & result, const Descriptor &x, const Descriptor &y, \
386       const char *sourceFile, int line) { \
387     MatmulTransposeHelper<false, TypeCategory::XCAT, XKIND, \
388         TypeCategory::YCAT, YKIND>{}(result, x, y, sourceFile, line); \
389   }
390 
391 #define MATMUL_FORCE_ALL_TYPES 0
392 
393 #include "flang/Runtime/matmul-instances.inc"
394 
395 RT_EXT_API_GROUP_END
396 } // extern "C"
397 } // namespace Fortran::runtime
398