1*207e5cccSFangrui Song // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \ 2*207e5cccSFangrui Song // RUN: -disable-O0-optnone -emit-llvm -o - %s \ 3*207e5cccSFangrui Song // RUN: | opt -S -passes=mem2reg,sroa \ 4*207e5cccSFangrui Song // RUN: | FileCheck %s 5*207e5cccSFangrui Song 6*207e5cccSFangrui Song // REQUIRES: aarch64-registered-target 7*207e5cccSFangrui Song 8*207e5cccSFangrui Song #include <arm_neon.h> 9*207e5cccSFangrui Song 10*207e5cccSFangrui Song // CHECK-LABEL: test_vmmlaq_s32 11*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) 12*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]] 13*207e5cccSFangrui Song int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { 14*207e5cccSFangrui Song return vmmlaq_s32(r, a, b); 15*207e5cccSFangrui Song } 16*207e5cccSFangrui Song 17*207e5cccSFangrui Song // CHECK-LABEL: test_vmmlaq_u32 18*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) 19*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]] 20*207e5cccSFangrui Song uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { 21*207e5cccSFangrui Song return vmmlaq_u32(r, a, b); 22*207e5cccSFangrui Song } 23*207e5cccSFangrui Song 24*207e5cccSFangrui Song // CHECK-LABEL: test_vusmmlaq_s32 25*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) 26*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]] 27*207e5cccSFangrui Song int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { 28*207e5cccSFangrui Song return vusmmlaq_s32(r, a, b); 29*207e5cccSFangrui Song } 30*207e5cccSFangrui Song 31*207e5cccSFangrui Song // CHECK-LABEL: test_vusdot_s32 32*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) 33*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[VAL]] 34*207e5cccSFangrui Song int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { 35*207e5cccSFangrui Song return vusdot_s32(r, a, b); 36*207e5cccSFangrui Song } 37*207e5cccSFangrui Song 38*207e5cccSFangrui Song // CHECK-LABEL: test_vusdot_lane_s32 39*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> 40*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> 41*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 42*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer 43*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 44*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> 45*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) 46*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]] 47*207e5cccSFangrui Song int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { 48*207e5cccSFangrui Song return vusdot_lane_s32(r, a, b, 0); 49*207e5cccSFangrui Song } 50*207e5cccSFangrui Song 51*207e5cccSFangrui Song // CHECK-LABEL: test_vsudot_lane_s32 52*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> 53*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %0 to <8 x i8> 54*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> %1 to <2 x i32> 55*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer 56*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 57*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> 58*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) 59*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]] 60*207e5cccSFangrui Song int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { 61*207e5cccSFangrui Song return vsudot_lane_s32(r, a, b, 0); 62*207e5cccSFangrui Song } 63*207e5cccSFangrui Song 64*207e5cccSFangrui Song // CHECK-LABEL: test_vusdot_laneq_s32 65*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> 66*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> 67*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 68*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer 69*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 70*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> 71*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) 72*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]] 73*207e5cccSFangrui Song int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) { 74*207e5cccSFangrui Song return vusdot_laneq_s32(r, a, b, 0); 75*207e5cccSFangrui Song } 76*207e5cccSFangrui Song 77*207e5cccSFangrui Song // CHECK-LABEL: test_vsudot_laneq_s32 78*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> 79*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> 80*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 81*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer 82*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 83*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> 84*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) 85*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]] 86*207e5cccSFangrui Song int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) { 87*207e5cccSFangrui Song return vsudot_laneq_s32(r, a, b, 0); 88*207e5cccSFangrui Song } 89*207e5cccSFangrui Song 90*207e5cccSFangrui Song // CHECK-LABEL: test_vusdotq_s32 91*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) 92*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]] 93*207e5cccSFangrui Song int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { 94*207e5cccSFangrui Song return vusdotq_s32(r, a, b); 95*207e5cccSFangrui Song } 96*207e5cccSFangrui Song 97*207e5cccSFangrui Song // CHECK-LABEL: test_vusdotq_lane_s32 98*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> 99*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> 100*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 101*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer 102*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> 103*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> 104*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) 105*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]] 106*207e5cccSFangrui Song int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { 107*207e5cccSFangrui Song return vusdotq_lane_s32(r, a, b, 0); 108*207e5cccSFangrui Song } 109*207e5cccSFangrui Song 110*207e5cccSFangrui Song // CHECK-LABEL: test_vsudotq_lane_s32 111*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> 112*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> 113*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 114*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer 115*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> 116*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> 117*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) 118*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]] 119*207e5cccSFangrui Song int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { 120*207e5cccSFangrui Song return vsudotq_lane_s32(r, a, b, 0); 121*207e5cccSFangrui Song } 122*207e5cccSFangrui Song 123*207e5cccSFangrui Song // CHECK-LABEL: test_vusdotq_laneq_s32 124*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> 125*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> 126*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 127*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer 128*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> 129*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> 130*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) 131*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]] 132*207e5cccSFangrui Song int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { 133*207e5cccSFangrui Song return vusdotq_laneq_s32(r, a, b, 0); 134*207e5cccSFangrui Song } 135*207e5cccSFangrui Song 136*207e5cccSFangrui Song // CHECK-LABEL: test_vsudotq_laneq_s32 137*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> 138*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> 139*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> 140*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer 141*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> 142*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> 143*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) 144*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]] 145*207e5cccSFangrui Song int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) { 146*207e5cccSFangrui Song return vsudotq_laneq_s32(r, a, b, 0); 147*207e5cccSFangrui Song } 148