// RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK64 %s
// RUN: %clang_cc1 -no-enable-noundef-analysis -fenable-matrix -triple i386-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=COMMON,CHECK32 %s

// Also check we do not crash when running some middle-end passes. Most
// importantly this includes the IR verifier, to ensure we emit valid IR.
// RUN: %clang_cc1 -fenable-matrix -emit-llvm -triple x86_64-apple-darwin %s -o %t

// Tests for the matrix type builtins.

typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
typedef unsigned ux6x1_t __attribute__((matrix_type(6, 1)));

void transpose_double_5x5(dx5x5_t *a) {
  // COMMON-LABEL: define{{.*}} void @transpose_double_5x5(
  // CHECK32:       [[A:%.*]] = load <25 x double>, ptr {{.*}}, align 4
  // CHECK64:       [[A:%.*]] = load <25 x double>, ptr {{.*}}, align 8
  // COMMON-NEXT:   [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5)
  // CHECK32-NEXT:  store <25 x double> [[TRANS]], ptr %a_t, align 4
  // CHECK64-NEXT:  store <25 x double> [[TRANS]], ptr %a_t, align 8

  dx5x5_t a_t = __builtin_matrix_transpose(*a);
}

void transpose_float_3x2(fx3x2_t *a) {
  // COMMON-LABEL: define{{.*}} void @transpose_float_3x2(
  // COMMON:        [[A:%.*]] = load <6 x float>, ptr {{.*}}, align 4
  // COMMON-NEXT:   [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
  // COMMON-NEXT:   store <6 x float> [[TRANS]], ptr %a_t, align 4

  fx2x3_t a_t = __builtin_matrix_transpose(*a);
}

void transpose_int_20x4(ix20x4_t *a) {
  // COMMON-LABEL: define{{.*}} void @transpose_int_20x4(
  // COMMON:         [[A:%.*]] = load <80 x i32>, ptr {{.*}}, align 4
  // COMMON-NEXT:    [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4)
  // COMMON-NEXT:    store <80 x i32> [[TRANS]], ptr %a_t, align 4

  ix4x20_t a_t = __builtin_matrix_transpose(*a);
}

struct Foo {
  ux1x6_t in;
  ux6x1_t out;
};

void transpose_struct_member(struct Foo *F) {
  // COMMON-LABEL: define{{.*}} void @transpose_struct_member(
  // COMMON:         [[M:%.*]] = load <6 x i32>, ptr {{.*}}, align 4
  // COMMON-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
  // CHECK32-NEXT:   [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 4
  // CHECK64-NEXT:   [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 8
  // COMMON-NEXT:    [[OUT_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr [[F_ADDR]], i32 0, i32 1
  // COMMON-NEXT:    store <6 x i32> [[M_T]], ptr [[OUT_PTR]], align 4

  F->out = __builtin_matrix_transpose(F->in);
}

void transpose_transpose_struct_member(struct Foo *F) {
  // COMMON-LABEL: define{{.*}} void @transpose_transpose_struct_member(
  // COMMON:         [[M:%.*]] = load <6 x i32>, ptr {{.*}}, align 4
  // COMMON-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
  // COMMON-NEXT:    [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1)
  // CHECK32-NEXT:   [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 4
  // CHECK64-NEXT:   [[F_ADDR:%.*]] = load ptr, ptr %F.addr, align 8
  // COMMON-NEXT:    [[IN_PTR:%.*]] = getelementptr inbounds nuw %struct.Foo, ptr [[F_ADDR]], i32 0, i32 0
  // COMMON-NEXT:    store <6 x i32> [[M_T2]], ptr [[IN_PTR]], align 4

  F->in = __builtin_matrix_transpose(__builtin_matrix_transpose(F->in));
}

dx5x5_t get_matrix(void);

void transpose_rvalue(void) {
  // COMMON-LABEL: define{{.*}} void @transpose_rvalue()
  // COMMON-NEXT:  entry:
  // CHECK32-NEXT:   [[M_T_ADDR:%.*]] = alloca [25 x double], align 4
  // CHECK64-NEXT:   [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
  // COMMON-NEXT:    [[CALL:%.*]] = call <25 x double> @get_matrix()
  // COMMON-NEXT:    [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[CALL]], i32 5, i32 5)
  // CHECK32-NEXT:   store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 4
  // CHECK64-NEXT:   store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 8

  dx5x5_t m_t = __builtin_matrix_transpose(get_matrix());
}

const dx5x5_t global_matrix;

void transpose_global(void) {
  // COMMON-LABEL: define{{.*}} void @transpose_global()
  // COMMON-NEXT:  entry:
  // CHECK32-NEXT:    [[M_T_ADDR:%.*]] = alloca [25 x double], align 4
  // CHECK32-NEXT:    [[GLOBAL_MATRIX:%.*]] = load <25 x double>, ptr @global_matrix, align 4
  // CHECK64-NEXT:    [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
  // CHECK64-NEXT:    [[GLOBAL_MATRIX:%.*]] = load <25 x double>, ptr @global_matrix, align 8
  // COMMON-NEXT:    [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[GLOBAL_MATRIX]], i32 5, i32 5)
  // CHECK32-NEXT:    store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 4
  // CHECK64-NEXT:    store <25 x double> [[M_T]], ptr [[M_T_ADDR]], align 8

  dx5x5_t m_t = __builtin_matrix_transpose(global_matrix);
}

void column_major_load_with_const_stride_double(double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_with_const_stride_double(ptr %Ptr)
  // CHECK32:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
  // CHECK64:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)

  dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
}

void column_major_load_with_const_stride2_double(double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_with_const_stride2_double(ptr %Ptr)
  // CHECK32:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 15, i1 false, i32 5, i32 5)
  // CHECK64:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5)

  dx5x5_t m_a2 = __builtin_matrix_column_major_load(Ptr, 5, 5, 2 * 3 + 9);
}

void column_major_load_with_variable_stride_ull_float(float *Ptr, unsigned long long S) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_with_variable_stride_ull_float(ptr %Ptr, i64 %S)
  // CHECK32:         [[S:%.*]] = load i64, ptr %S.addr, align 8
  // CHECK32-NEXT:    [[STRIDE_TRUNC:%.*]] = trunc i64 [[S]] to i32
  // CHECK32-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call <6 x float> @llvm.matrix.column.major.load.v6f32.i32(ptr align 4 [[PTR]], i32 [[STRIDE_TRUNC]], i1 false, i32 2, i32 3)

  // CHECK64:         [[S:%.*]] = load i64, ptr %S.addr, align 8
  // CHECK64-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr align 4 [[PTR]], i64 [[S]], i1 false, i32 2, i32 3)

  fx2x3_t m_b = __builtin_matrix_column_major_load(Ptr, 2, 3, S);
}

void column_major_load_with_stride_math_int(int *Ptr, int S) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_with_stride_math_int(ptr %Ptr, i32 %S)
  // COMMON:         [[S:%.*]] = load i32, ptr %S.addr, align 4
  // COMMON-NEXT:    [[STRIDE:%.*]] = add nsw i32 [[S]], 32
  // CHECK32-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:   call <80 x i32> @llvm.matrix.column.major.load.v80i32.i32(ptr align 4 [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 20)
  //
  // CHECK64-NEXT:   [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
  // CHECK64-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:   call <80 x i32> @llvm.matrix.column.major.load.v80i32.i64(ptr align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20)

  ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32);
}

void column_major_load_with_stride_math_s_int(int *Ptr, short S) {
  // COMMON-LABEL:  define{{.*}} void @column_major_load_with_stride_math_s_int(ptr %Ptr, i16 signext %S)
  // COMMON:         [[S:%.*]] = load i16, ptr %S.addr, align 2
  // COMMON-NEXT:    [[S_EXT:%.*]] = sext i16 [[S]] to i32
  // COMMON-NEXT:    [[STRIDE:%.*]] = add nsw i32 [[S_EXT]], 32
  // CHECK32-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:   %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32.i32(ptr align 4 [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 20)
  //
  // CHECK64-NEXT:   [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
  // CHECK64-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:   %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32.i64(ptr align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20)

  ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32);
}

void column_major_load_array1(double Ptr[25]) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_array1(ptr %Ptr)
  // CHECK32:         [[ADDR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[ADDR]], i32 5, i1 false, i32 5, i32 5)

  // CHECK64:         [[ADDR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[ADDR]], i64 5, i1 false, i32 5, i32 5)

  dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
}

void column_major_load_array2(void) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_array2() #0 {
  // COMMON-NEXT:  entry:
  // CHECK32-NEXT:    [[PTR:%.*]] = alloca [25 x double], align 8
  // CHECK32:         [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], ptr [[PTR]], i32 0, i32 0
  // CHECK32-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 8 [[ARRAY_DEC]], i32 5, i1 false, i32 5, i32 5)

  // CHECK64-NEXT:    [[PTR:%.*]] = alloca [25 x double], align 16
  // CHECK64:         [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], ptr [[PTR]], i64 0, i64 0
  // CHECK64-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 16 [[ARRAY_DEC]], i64 5, i1 false, i32 5, i32 5)

  double Ptr[25];
  dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
}

void column_major_load_const(const double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_const(ptr %Ptr)
  // CHECK32:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
  //
  // CHECK64:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)

  dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
}

void column_major_load_volatile(volatile double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_load_volatile(ptr %Ptr)
  // CHECK32:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i32(ptr align 4 [[PTR]], i32 5, i1 true, i32 5, i32 5)
  //
  // CHECK64:         [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64.i64(ptr align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5)

  dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
}

void column_major_store_with_const_stride_double(double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_with_const_stride_double(ptr %Ptr)
  // CHECK32:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
  // CHECK32-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
  //
  // CHECK64:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
  // CHECK64-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)

  dx5x5_t m;
  __builtin_matrix_column_major_store(m, Ptr, 5);
}

void column_major_store_with_const_stride2_double(double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_with_const_stride2_double(ptr %Ptr)
  // CHECK32:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
  // CHECK32-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 15, i1 false, i32 5, i32 5)
  //
  // CHECK64:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
  // CHECK64-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5)
  //
  dx5x5_t m;
  __builtin_matrix_column_major_store(m, Ptr, 2 * 3 + 9);
}

void column_major_store_with_stride_math_int(int *Ptr, int S) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_with_stride_math_int(ptr %Ptr, i32 %S)
  // COMMON:         [[M:%.*]] = load <80 x i32>, ptr {{.*}}, align 4
  // CHECK32-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK64-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // COMMON-NEXT:    [[S:%.*]] = load i32, ptr %S.addr, align 4
  // COMMON-NEXT:    [[ADD:%.*]] = add nsw i32 [[S]], 32
  // CHECK32-NEXT:   call void @llvm.matrix.column.major.store.v80i32.i32(<80 x i32> [[M]], ptr align 4 [[PTR]], i32 [[ADD]], i1 false, i32 4, i32 20)
  //
  // CHECK64-NEXT:   [[IDX:%.*]] = sext i32 [[ADD]] to i64
  // CHECK64-NEXT:   call void @llvm.matrix.column.major.store.v80i32.i64(<80 x i32> [[M]], ptr align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20)

  ix4x20_t m;
  __builtin_matrix_column_major_store(m, Ptr, S + 32);
}

void column_major_store_with_stride_math_s_int(int *Ptr, short S) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_with_stride_math_s_int(ptr %Ptr, i16 signext %S)
  // COMMON:         [[M:%.*]] = load <80 x i32>, ptr {{.*}}, align 4
  // CHECK32-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK64-NEXT:   [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // COMMON-NEXT:    [[S:%.*]] = load i16, ptr %S.addr, align 2
  // COMMON-NEXT:    [[EXT:%.*]] = sext i16 [[S]] to i32
  // COMMON-NEXT:    [[ADD:%.*]] = add nsw i32 [[EXT]], 2
  // CHECK32-NEXT:   call void @llvm.matrix.column.major.store.v80i32.i32(<80 x i32> [[M]], ptr align 4 [[PTR]], i32 [[ADD]], i1 false, i32 4, i32 20)
  //
  // CHECK64-NEXT:   [[IDX:%.*]] = sext i32 [[ADD]] to i64
  // CHECK64-NEXT:   call void @llvm.matrix.column.major.store.v80i32.i64(<80 x i32> [[M]], ptr align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20)

  ix4x20_t m;
  __builtin_matrix_column_major_store(m, Ptr, S + 2);
}

void column_major_store_array1(double Ptr[25]) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_array1(ptr %Ptr)
  // CHECK32:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
  // CHECK32-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 5, i1 false, i32 5, i32 5)
  //
  // CHECK64:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
  // CHECK64-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)

  dx5x5_t m;
  __builtin_matrix_column_major_store(m, Ptr, 5);
}

void column_major_store_array2(void) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_array2()
  // CHECK32:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
  // CHECK32-NEXT:    [[PTR:%.*]] = getelementptr inbounds [25 x double], ptr %Ptr, i32 0, i32 0
  // CHECK32-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 8 [[PTR]], i32 5, i1 false, i32 5, i32 5)
  //
  // CHECK64:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
  // CHECK64-NEXT:    [[PTR:%.*]] = getelementptr inbounds [25 x double], ptr %Ptr, i64 0, i64 0
  // CHECK64-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 16 [[PTR]], i64 5, i1 false, i32 5, i32 5)

  double Ptr[25];
  dx5x5_t m;
  __builtin_matrix_column_major_store(m, Ptr, 5);
}

void column_major_store_volatile(volatile double *Ptr) {
  // COMMON-LABEL: define{{.*}} void @column_major_store_volatile(ptr %Ptr) #0 {
  // CHECK32:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 4
  // CHECK32-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 4
  // CHECK32-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i32(<25 x double> [[M]], ptr align 4 [[PTR]], i32 5, i1 true, i32 5, i32 5)
  //
  // CHECK64:         [[M:%.*]] = load <25 x double>, ptr {{.*}}, align 8
  // CHECK64-NEXT:    [[PTR:%.*]] = load ptr, ptr %Ptr.addr, align 8
  // CHECK64-NEXT:    call void @llvm.matrix.column.major.store.v25f64.i64(<25 x double> [[M]], ptr align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5)

  dx5x5_t m;
  __builtin_matrix_column_major_store(m, Ptr, 5);
}