// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s

// REQUIRES: aarch64-registered-target

#include <arm_neon.h>
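
// Checks that the Armv8.6-A I8MM intrinsics lower to the expected AArch64
// intrinsic calls: the 8-bit integer matrix multiply-accumulate operations
// (vmmlaq_s32, vmmlaq_u32, vusmmlaq_s32) and the mixed-sign dot products
// (vusdot/vsudot in their plain, lane, laneq, and quadword forms).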

// CHECK-LABEL: test_vmmlaq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(r, a, b);
}

// CHECK-LABEL: test_vmmlaq_u32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
  return vmmlaq_u32(r, a, b);
}

// CHECK-LABEL: test_vusmmlaq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusmmlaq_s32(r, a, b);
}

// CHECK-LABEL: test_vusdot_s32
// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
// CHECK: ret <2 x i32> [[VAL]]
int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_s32(r, a, b);
}
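
// In the lane/laneq tests below, the CHECK lines follow how the lane
// selection is lowered: the byte vector is reinterpreted as i32 lanes, the
// requested lane is broadcast with a shufflevector, and the splat is bitcast
// back to bytes before the call. As the CHECK lines show, the vsudot*
// variants are emitted as @llvm.aarch64.neon.usdot with the two byte
// operands swapped, so the unsigned operand always comes first.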

// CHECK-LABEL: test_vusdot_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
  return vusdot_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vsudot_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
  return vsudot_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vusdot_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
  return vusdot_laneq_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vsudot_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
// CHECK: ret <2 x i32> [[OP]]
int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
  return vsudot_laneq_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vusdotq_s32
// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
// CHECK: ret <4 x i32> [[VAL]]
int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusdotq_s32(r, a, b);
}

// CHECK-LABEL: test_vusdotq_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
  return vusdotq_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vsudotq_lane_s32
// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
  return vsudotq_lane_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vusdotq_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
  return vusdotq_laneq_s32(r, a, b, 0);
}

// CHECK-LABEL: test_vsudotq_laneq_s32
// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
// CHECK: ret <4 x i32> [[OP]]
int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
  return vsudotq_laneq_s32(r, a, b, 0);
}