# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN:   %PYTHON %s | FileCheck %s

# ===----------------------------------------------------------------------===//
#  Chapter 1 : 2D Saxpy
# ===----------------------------------------------------------------------===//
#
# This program demonstrates 2D Saxpy:
#  1. Use GPU dialect to allocate and copy memory host to gpu and vice versa
#  2. Computes 2D SAXPY kernel using operator overloading
#  3. Pass numpy arrays to MLIR as memref arguments
#  4. Verify MLIR program with reference computation in python
#
# ===----------------------------------------------------------------------===//


from mlir import ir
from mlir.dialects import gpu, memref
from tools.nvdsl import *
import numpy as np


# Builds the SAXPY program: copies `x` and `y` to device buffers, launches a
# kernel that updates every element of the device copy of `y` with
# `y += x * alpha`, then copies the result back into `y`.
#
# NOTE: the launch configuration reads the module-level globals `M` and `N`,
# which are assigned further down; they must be defined before `saxpy` is
# called.
@NVDSL.mlir_func
def saxpy(x, y, alpha):
    # 1. Use MLIR GPU dialect to allocate and copy memory
    # Each op takes the previous op's token as a dependency, so the
    # allocations and copies form a single chain t1 -> ... -> t6.
    token_ty = gpu.AsyncTokenType.get()
    t1 = gpu.wait(token_ty, [])
    x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
    y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
    t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
    t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
    t6 = gpu.wait(token_ty, [t5])

    # 2. Compute 2D SAXPY kernel
    # One block per row (grid x = M) and one thread per column (block x = N):
    # the block id and thread id are used directly as the 2D element index.
    @NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
    def saxpy_kernel():
        bidx = gpu.block_id(gpu.Dimension.x)
        tidx = gpu.thread_id(gpu.Dimension.x)
        x_val = memref.load(x_dev, [bidx, tidx])
        y_val = memref.load(y_dev, [bidx, tidx])

        # SAXPY: y[i] += a * x[i];
        y_val += x_val * alpha

        memref.store(y_val, y_dev, [bidx, tidx])

    saxpy_kernel()

    # Copy the updated device buffer back into the host array `y`.
    t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
    gpu.wait(token_ty, [t7])


# 3. Pass numpy arrays to MLIR
M = 256
N = 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)

# 4. Verify MLIR with reference computation
# `y` started as all ones, so the expected result is 1 + alpha * x.
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements