xref: /llvm-project/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c (revision 207e5ccceec8d3cc3f32723e78f2a142bc61b07d)
1*207e5cccSFangrui Song // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
2*207e5cccSFangrui Song // RUN: -disable-O0-optnone -emit-llvm -o - %s \
3*207e5cccSFangrui Song // RUN: | opt -S -passes=mem2reg,sroa \
4*207e5cccSFangrui Song // RUN: | FileCheck %s
5*207e5cccSFangrui Song 
6*207e5cccSFangrui Song // REQUIRES: aarch64-registered-target
7*207e5cccSFangrui Song 
8*207e5cccSFangrui Song #include <arm_neon.h>
9*207e5cccSFangrui Song 
10*207e5cccSFangrui Song // CHECK-LABEL: test_vmmlaq_s32
11*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
12*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]]
// Calls vmmlaq_s32 with a 32-bit accumulator and two signed byte vectors.
// The FileCheck directives above expect a single call to
// llvm.aarch64.neon.smmla.v4i32.v16i8 that takes r, a and b unchanged and
// whose result is returned directly.
13*207e5cccSFangrui Song int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
14*207e5cccSFangrui Song   return vmmlaq_s32(r, a, b);
15*207e5cccSFangrui Song }
16*207e5cccSFangrui Song 
17*207e5cccSFangrui Song // CHECK-LABEL: test_vmmlaq_u32
18*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
19*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]]
// Unsigned counterpart of the vmmlaq test: all three operands are unsigned
// vector types, and the expected IR is a single call to
// llvm.aarch64.neon.ummla.v4i32.v16i8 on r, a and b, returned as-is.
20*207e5cccSFangrui Song uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
21*207e5cccSFangrui Song   return vmmlaq_u32(r, a, b);
22*207e5cccSFangrui Song }
23*207e5cccSFangrui Song 
24*207e5cccSFangrui Song // CHECK-LABEL: test_vusmmlaq_s32
25*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
26*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]]
// Mixed-sign variant: a is an unsigned byte vector and b a signed one.
// The expected IR is a single call to llvm.aarch64.neon.usmmla.v4i32.v16i8
// with the arguments passed in source order (r, a, b).
27*207e5cccSFangrui Song int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
28*207e5cccSFangrui Song   return vusmmlaq_s32(r, a, b);
29*207e5cccSFangrui Song }
30*207e5cccSFangrui Song 
31*207e5cccSFangrui Song // CHECK-LABEL: test_vusdot_s32
32*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
33*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[VAL]]
// 64-bit vusdot without lane selection: unsigned a, signed b. The expected IR
// is one call to llvm.aarch64.neon.usdot.v2i32.v8i8 on r, a and b with no
// intermediate casts.
34*207e5cccSFangrui Song int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
35*207e5cccSFangrui Song   return vusdot_s32(r, a, b);
36*207e5cccSFangrui Song }
37*207e5cccSFangrui Song 
38*207e5cccSFangrui Song // CHECK-LABEL: test_vusdot_lane_s32
39*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
40*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
41*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
42*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
43*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
44*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
45*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
46*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]]
// Lane form with a 64-bit b and lane index 0. The expected IR reinterprets b
// as <2 x i32>, broadcasts one element with a shufflevector whose mask is
// zeroinitializer, converts the result back to <8 x i8>, and passes it as the
// last operand of llvm.aarch64.neon.usdot.v2i32.v8i8 (r and a are unchanged).
47*207e5cccSFangrui Song int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
48*207e5cccSFangrui Song   return vusdot_lane_s32(r, a, b, 0);
49*207e5cccSFangrui Song }
50*207e5cccSFangrui Song 
51*207e5cccSFangrui Song // CHECK-LABEL: test_vsudot_lane_s32
52*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
53*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
54*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
55*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
56*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
57*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
58*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
59*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]]
// Signed-by-unsigned lane form: a is signed, b is unsigned, lane index 0.
// The expected IR still calls llvm.aarch64.neon.usdot.v2i32.v8i8, but with the
// byte operands swapped relative to the source: the broadcast lane taken from
// b is the second argument and a is the third.
60*207e5cccSFangrui Song int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) {
61*207e5cccSFangrui Song   return vsudot_lane_s32(r, a, b, 0);
62*207e5cccSFangrui Song }
63*207e5cccSFangrui Song 
64*207e5cccSFangrui Song // CHECK-LABEL: test_vusdot_laneq_s32
65*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
66*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
67*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
68*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
69*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
70*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
71*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]])
72*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]]
// laneq form: b is a 128-bit vector while r and a are 64-bit. The expected IR
// reinterprets b as <4 x i32>, uses a shufflevector with a <2 x i32>
// zeroinitializer mask to produce a 2-element broadcast, converts that to
// <8 x i8>, and passes it as the last operand of
// llvm.aarch64.neon.usdot.v2i32.v8i8.
73*207e5cccSFangrui Song int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) {
74*207e5cccSFangrui Song   return vusdot_laneq_s32(r, a, b, 0);
75*207e5cccSFangrui Song }
76*207e5cccSFangrui Song 
77*207e5cccSFangrui Song // CHECK-LABEL: test_vsudot_laneq_s32
78*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
79*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
80*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
81*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer
82*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
83*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8>
84*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a)
85*207e5cccSFangrui Song // CHECK: ret <2 x i32> [[OP]]
// Signed-by-unsigned laneq form: a is a signed 64-bit vector, b an unsigned
// 128-bit vector, lane index 0. The expected IR narrows b to a 2-element
// broadcast as in the vusdot laneq test, then calls
// llvm.aarch64.neon.usdot.v2i32.v8i8 with the broadcast as the second argument
// and a as the third (operands swapped relative to the source).
86*207e5cccSFangrui Song int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) {
87*207e5cccSFangrui Song   return vsudot_laneq_s32(r, a, b, 0);
88*207e5cccSFangrui Song }
89*207e5cccSFangrui Song 
90*207e5cccSFangrui Song // CHECK-LABEL: test_vusdotq_s32
91*207e5cccSFangrui Song // CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
92*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[VAL]]
// 128-bit vusdot without lane selection. The expected IR is one call to
// llvm.aarch64.neon.usdot.v4i32.v16i8 on r, a and b, returned directly.
93*207e5cccSFangrui Song int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
94*207e5cccSFangrui Song   return vusdotq_s32(r, a, b);
95*207e5cccSFangrui Song }
96*207e5cccSFangrui Song 
97*207e5cccSFangrui Song // CHECK-LABEL: test_vusdotq_lane_s32
98*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
99*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
100*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
101*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
102*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
103*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
104*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
105*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]]
// 128-bit accumulate with a 64-bit lane source: r and a are 128-bit, b is
// 64-bit, lane index 0. The expected IR reinterprets b as <2 x i32>, widens it
// with a shufflevector whose mask is a <4 x i32> zeroinitializer, converts the
// result to <16 x i8>, and passes it as the last operand of
// llvm.aarch64.neon.usdot.v4i32.v16i8.
106*207e5cccSFangrui Song int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) {
107*207e5cccSFangrui Song   return vusdotq_lane_s32(r, a, b, 0);
108*207e5cccSFangrui Song }
109*207e5cccSFangrui Song 
110*207e5cccSFangrui Song // CHECK-LABEL: test_vsudotq_lane_s32
111*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32>
112*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
113*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
114*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer
115*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
116*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
117*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
118*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]]
// Signed-by-unsigned 128-bit form with a 64-bit lane source: a is signed,
// b is unsigned, lane index 0. The expected IR builds the 4-element broadcast
// from b as in the vusdotq lane test, then calls
// llvm.aarch64.neon.usdot.v4i32.v16i8 with the broadcast as the second argument
// and a as the third (operands swapped relative to the source).
119*207e5cccSFangrui Song int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) {
120*207e5cccSFangrui Song   return vsudotq_lane_s32(r, a, b, 0);
121*207e5cccSFangrui Song }
122*207e5cccSFangrui Song 
123*207e5cccSFangrui Song // CHECK-LABEL: test_vusdotq_laneq_s32
124*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
125*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
126*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
127*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
128*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
129*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
130*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
131*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]]
// All-128-bit laneq form with lane index 0. The expected IR reinterprets b as
// <4 x i32>, broadcasts one element via a shufflevector with a <4 x i32>
// zeroinitializer mask, converts back to <16 x i8>, and passes it as the last
// operand of llvm.aarch64.neon.usdot.v4i32.v16i8.
132*207e5cccSFangrui Song int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
133*207e5cccSFangrui Song   return vusdotq_laneq_s32(r, a, b, 0);
134*207e5cccSFangrui Song }
135*207e5cccSFangrui Song 
136*207e5cccSFangrui Song // CHECK-LABEL: test_vsudotq_laneq_s32
137*207e5cccSFangrui Song // CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
138*207e5cccSFangrui Song // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
139*207e5cccSFangrui Song // CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
140*207e5cccSFangrui Song // CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
141*207e5cccSFangrui Song // CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
142*207e5cccSFangrui Song // CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
143*207e5cccSFangrui Song // CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
144*207e5cccSFangrui Song // CHECK: ret <4 x i32> [[OP]]
// Signed-by-unsigned all-128-bit laneq form: a is signed, b is unsigned, lane
// index 0. The expected IR builds the 4-element broadcast from b, then calls
// llvm.aarch64.neon.usdot.v4i32.v16i8 with the broadcast as the second argument
// and a as the third (operands swapped relative to the source).
145*207e5cccSFangrui Song int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
146*207e5cccSFangrui Song   return vsudotq_laneq_s32(r, a, b, 0);
147*207e5cccSFangrui Song }
148