xref: /llvm-project/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll (revision 29441e4f5fa5f5c7709f7cf180815ba97f611297)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
3
; Module-level target description: the leading "e" in the data layout selects
; little-endian, and the triple selects AArch64.
4target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
5target triple = "aarch64"
6
; Two i16 fields (4 bytes in total). The functions in this file pass a value of
; this type around coerced into an i64.
; NOTE(review): presumably the real and imaginary parts - confirm field order
; against the original C++ source.
7%struct.cmplx_int16_t = type { i16, i16 }
; One compressed entry: a single i8 (field 0, read through the value named %exp
; in the test function) followed by 24 i8 (field 1, read through %mantissa).
8%struct.compressed_data_8bit = type { i8, [24 x i8] }
9
; Test input: deliberately unoptimized IR (every local lives in an alloca) for
; the function whose name demangles to
;   block_scaling_decompr_8bit(unsigned, const compressed_data_8bit *,
;                              cmplx_int16_t *, const cmplx_int16_t *)
;
; For each of the %n_prb entries of %src the loop
;   1. loads three <8 x i8> chunks from the entry's mantissa array (elements
;      0, 8 and 16) and sign-extends each to <8 x i16>,
;   2. multiplies each chunk by a vector holding (i16)(exp * 2) in all lanes,
;   3. writes the three products to %dst: through the internal helper
;      @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t when %scale
;      is non-null, or directly when %scale is null.
; The function always returns 0.
;
; The autogenerated assertions that follow the signature expect the O3 pipeline
; to remove the stack traffic and to produce two copies of the loop, selected
; once in FOR_BODY_LR_PH by a null test of %scale: FOR_BODY_US (plain vector
; stores, taken when %scale is null) and FOR_BODY (keeps the three helper
; calls).
10define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
11; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
12; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
13; CHECK-NEXT:  [[ENTRY:.*:]]
14; CHECK-NEXT:    [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
15; CHECK-NEXT:    br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
16; CHECK:       [[FOR_BODY_LR_PH]]:
17; CHECK-NEXT:    [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
18; CHECK-NEXT:    [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
19; CHECK-NEXT:    br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
20; CHECK:       [[FOR_BODY_US]]:
21; CHECK-NEXT:    [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
22; CHECK-NEXT:    [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
23; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
24; CHECK-NEXT:    [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
25; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
26; CHECK-NEXT:    [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
27; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
28; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
29; CHECK-NEXT:    [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
30; CHECK-NEXT:    [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
31; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
32; CHECK-NEXT:    [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
33; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
34; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
35; CHECK-NEXT:    [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
36; CHECK-NEXT:    [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
37; CHECK-NEXT:    [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
38; CHECK-NEXT:    [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
39; CHECK-NEXT:    [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
40; CHECK-NEXT:    [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
41; CHECK-NEXT:    store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
42; CHECK-NEXT:    [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
43; CHECK-NEXT:    store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
44; CHECK-NEXT:    [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
45; CHECK-NEXT:    store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
46; CHECK-NEXT:    [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
47; CHECK-NEXT:    [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
48; CHECK-NEXT:    [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
49; CHECK-NEXT:    br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
50; CHECK:       [[FOR_BODY]]:
51; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
52; CHECK-NEXT:    [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
53; CHECK-NEXT:    [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
54; CHECK-NEXT:    [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
55; CHECK-NEXT:    [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
56; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
57; CHECK-NEXT:    [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
58; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
59; CHECK-NEXT:    [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
60; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
61; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
62; CHECK-NEXT:    [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
63; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
64; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
65; CHECK-NEXT:    [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
66; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
67; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i16
68; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
69; CHECK-NEXT:    [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
70; CHECK-NEXT:    [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
71; CHECK-NEXT:    [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
72; CHECK-NEXT:    [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
73; CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
74; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
75; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
76; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
77; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
78; CHECK-NEXT:    [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
79; CHECK-NEXT:    store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
80; CHECK-NEXT:    [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
81; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
82; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
83; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
84; CHECK-NEXT:    [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
85; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
86; CHECK-NEXT:    store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
87; CHECK-NEXT:    [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
88; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
89; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
90; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
91; CHECK-NEXT:    [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
92; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
93; CHECK-NEXT:    store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
94; CHECK-NEXT:    [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
95; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
96; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
97; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
98; CHECK:       [[FOR_END]]:
99; CHECK-NEXT:    ret i32 0
100;
101entry:
  ; Stack slots for the parameters, the loop counter %i and every temporary
  ; used below; nothing has been promoted to SSA registers in this input.
102  %__p0.addr.i75 = alloca <8 x i16>, align 16
103  %__p1.addr.i76 = alloca i16, align 2
104  %__ret.i77 = alloca <8 x i16>, align 16
105  %.compoundliteral.i78 = alloca <8 x i16>, align 16
106  %__p0.addr.i62 = alloca <8 x i16>, align 16
107  %__p1.addr.i63 = alloca i16, align 2
108  %__ret.i64 = alloca <8 x i16>, align 16
109  %.compoundliteral.i65 = alloca <8 x i16>, align 16
110  %__p0.addr.i60 = alloca <8 x i16>, align 16
111  %__p1.addr.i = alloca i16, align 2
112  %__ret.i61 = alloca <8 x i16>, align 16
113  %.compoundliteral.i = alloca <8 x i16>, align 16
114  %__p0.addr.i57 = alloca <8 x i8>, align 8
115  %__ret.i58 = alloca <8 x i16>, align 16
116  %__p0.addr.i54 = alloca <8 x i8>, align 8
117  %__ret.i55 = alloca <8 x i16>, align 16
118  %__p0.addr.i = alloca <8 x i8>, align 8
119  %__ret.i = alloca <8 x i16>, align 16
120  %n_prb.addr = alloca i32, align 4
121  %src.addr = alloca ptr, align 8
122  %dst.addr = alloca ptr, align 8
123  %scale.addr = alloca ptr, align 8
124  %i = alloca i32, align 4
125  %prb_comp_in = alloca [3 x <8 x i16>], align 16
126  %__ret = alloca <8 x i8>, align 8
127  %tmp = alloca <8 x i8>, align 8
128  %__ret3 = alloca <8 x i8>, align 8
129  %tmp8 = alloca <8 x i8>, align 8
130  %__ret11 = alloca <8 x i8>, align 8
131  %tmp16 = alloca <8 x i8>, align 8
132  %prb_decomp = alloca [3 x <8 x i16>], align 16
133  %scaling_factor = alloca i16, align 2
134  %__s1 = alloca <8 x i16>, align 16
135  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
136  %agg.tmp.coerce = alloca i64, align 8
137  %__s135 = alloca <8 x i16>, align 16
138  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
139  %agg.tmp37.coerce = alloca i64, align 8
140  %__s140 = alloca <8 x i16>, align 16
141  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
142  %agg.tmp42.coerce = alloca i64, align 8
143  %__s145 = alloca <8 x i16>, align 16
144  %__s148 = alloca <8 x i16>, align 16
145  %__s151 = alloca <8 x i16>, align 16
  ; Spill the four incoming arguments and initialise the loop counter to 0.
146  store i32 %n_prb, ptr %n_prb.addr, align 4
147  store ptr %src, ptr %src.addr, align 8
148  store ptr %dst, ptr %dst.addr, align 8
149  store ptr %scale, ptr %scale.addr, align 8
150  store i32 0, ptr %i, align 4
151  br label %for.cond
152
153for.cond:                                         ; preds = %for.inc, %entry
  ; Loop test: keep iterating while i < n_prb (unsigned comparison).
154  %0 = load i32, ptr %i, align 4
155  %1 = load i32, ptr %n_prb.addr, align 4
156  %cmp = icmp ult i32 %0, %1
157  br i1 %cmp, label %for.body, label %for.end
158
159for.body:                                         ; preds = %for.cond
  ; prb_comp_in[0] = sext(8 bytes starting at src[i].mantissa[0]); the value
  ; is bounced through several stack temporaries on the way.
160  %2 = load ptr, ptr %src.addr, align 8
161  %3 = load i32, ptr %i, align 4
162  %idxprom = zext i32 %3 to i64
163  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom
164  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1
165  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0
166  %4 = load <8 x i8>, ptr %arrayidx1, align 1
167  store <8 x i8> %4, ptr %__ret, align 8
168  %5 = load <8 x i8>, ptr %__ret, align 8
169  store <8 x i8> %5, ptr %tmp, align 8
170  %6 = load <8 x i8>, ptr %tmp, align 8
171  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
172  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
173  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16>
174  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
175  %8 = load <8 x i16>, ptr %__ret.i58, align 16
176  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
177  store <8 x i16> %8, ptr %arrayidx2, align 16
  ; prb_comp_in[1] = sext(8 bytes starting at src[i].mantissa[8]).
178  %9 = load ptr, ptr %src.addr, align 8
179  %10 = load i32, ptr %i, align 4
180  %idxprom4 = zext i32 %10 to i64
181  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
182  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
183  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8
184  %11 = load <8 x i8>, ptr %arrayidx7, align 1
185  store <8 x i8> %11, ptr %__ret3, align 8
186  %12 = load <8 x i8>, ptr %__ret3, align 8
187  store <8 x i8> %12, ptr %tmp8, align 8
188  %13 = load <8 x i8>, ptr %tmp8, align 8
189  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
190  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
191  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
192  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
193  %15 = load <8 x i16>, ptr %__ret.i55, align 16
194  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
195  store <8 x i16> %15, ptr %arrayidx10, align 16
  ; prb_comp_in[2] = sext(8 bytes starting at src[i].mantissa[16]).
196  %16 = load ptr, ptr %src.addr, align 8
197  %17 = load i32, ptr %i, align 4
198  %idxprom12 = zext i32 %17 to i64
199  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
200  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
201  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16
202  %18 = load <8 x i8>, ptr %arrayidx15, align 1
203  store <8 x i8> %18, ptr %__ret11, align 8
204  %19 = load <8 x i8>, ptr %__ret11, align 8
205  store <8 x i8> %19, ptr %tmp16, align 8
206  %20 = load <8 x i8>, ptr %tmp16, align 8
207  store <8 x i8> %20, ptr %__p0.addr.i, align 8
208  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
209  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
210  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
211  %22 = load <8 x i16>, ptr %__ret.i, align 16
212  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
213  store <8 x i16> %22, ptr %arrayidx18, align 16
  ; scaling_factor = (i16)((i32)src[i].exp * 2): field 0 is widened to i32,
  ; doubled, then truncated back to i16.
214  %23 = load ptr, ptr %src.addr, align 8
215  %24 = load i32, ptr %i, align 4
216  %idxprom19 = zext i32 %24 to i64
217  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
218  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0
219  %25 = load i8, ptr %exp, align 1
220  %conv = sext i8 %25 to i32
221  %mul = mul nsw i32 %conv, 2
222  %conv21 = trunc i32 %mul to i16
223  store i16 %conv21, ptr %scaling_factor, align 2
  ; prb_decomp[0] = prb_comp_in[0] * <scaling_factor in all 8 lanes>; the
  ; multiplier vector is built one lane at a time with insertelement.
224  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
225  %26 = load <8 x i16>, ptr %arrayidx22, align 16
226  %27 = load i16, ptr %scaling_factor, align 2
227  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
228  store i16 %27, ptr %__p1.addr.i76, align 2
229  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
230  %29 = load i16, ptr %__p1.addr.i76, align 2
231  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0
232  %30 = load i16, ptr %__p1.addr.i76, align 2
233  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
234  %31 = load i16, ptr %__p1.addr.i76, align 2
235  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
236  %32 = load i16, ptr %__p1.addr.i76, align 2
237  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
238  %33 = load i16, ptr %__p1.addr.i76, align 2
239  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
240  %34 = load i16, ptr %__p1.addr.i76, align 2
241  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
242  %35 = load i16, ptr %__p1.addr.i76, align 2
243  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
244  %36 = load i16, ptr %__p1.addr.i76, align 2
245  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
246  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
247  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
248  %mul.i87 = mul <8 x i16> %28, %37
249  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
250  %38 = load <8 x i16>, ptr %__ret.i77, align 16
251  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
252  store <8 x i16> %38, ptr %arrayidx24, align 16
  ; prb_decomp[1] = prb_comp_in[1] * <scaling_factor in all 8 lanes>.
253  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
254  %39 = load <8 x i16>, ptr %arrayidx25, align 16
255  %40 = load i16, ptr %scaling_factor, align 2
256  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
257  store i16 %40, ptr %__p1.addr.i63, align 2
258  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
259  %42 = load i16, ptr %__p1.addr.i63, align 2
260  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
261  %43 = load i16, ptr %__p1.addr.i63, align 2
262  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
263  %44 = load i16, ptr %__p1.addr.i63, align 2
264  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
265  %45 = load i16, ptr %__p1.addr.i63, align 2
266  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
267  %46 = load i16, ptr %__p1.addr.i63, align 2
268  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
269  %47 = load i16, ptr %__p1.addr.i63, align 2
270  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
271  %48 = load i16, ptr %__p1.addr.i63, align 2
272  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
273  %49 = load i16, ptr %__p1.addr.i63, align 2
274  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
275  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
276  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
277  %mul.i74 = mul <8 x i16> %41, %50
278  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
279  %51 = load <8 x i16>, ptr %__ret.i64, align 16
280  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
281  store <8 x i16> %51, ptr %arrayidx27, align 16
  ; prb_decomp[2] = prb_comp_in[2] * <scaling_factor in all 8 lanes>.
282  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
283  %52 = load <8 x i16>, ptr %arrayidx28, align 16
284  %53 = load i16, ptr %scaling_factor, align 2
285  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
286  store i16 %53, ptr %__p1.addr.i, align 2
287  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
288  %55 = load i16, ptr %__p1.addr.i, align 2
289  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
290  %56 = load i16, ptr %__p1.addr.i, align 2
291  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
292  %57 = load i16, ptr %__p1.addr.i, align 2
293  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
294  %58 = load i16, ptr %__p1.addr.i, align 2
295  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
296  %59 = load i16, ptr %__p1.addr.i, align 2
297  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
298  %60 = load i16, ptr %__p1.addr.i, align 2
299  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
300  %61 = load i16, ptr %__p1.addr.i, align 2
301  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
302  %62 = load i16, ptr %__p1.addr.i, align 2
303  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
304  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
305  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
306  %mul.i = mul <8 x i16> %54, %63
307  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
308  %64 = load <8 x i16>, ptr %__ret.i61, align 16
309  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
310  store <8 x i16> %64, ptr %arrayidx30, align 16
  ; Pick the output path: %scale is re-tested on every iteration here even
  ; though it never changes inside the loop.
311  %65 = load ptr, ptr %scale.addr, align 8
312  %cmp31 = icmp ne ptr %65, null
313  br i1 %cmp31, label %if.then, label %if.else
314
315if.then:                                          ; preds = %for.body
  ; %scale is non-null. For each of the three vectors: copy the 4 bytes at
  ; *scale into a cmplx_int16_t temporary, copy those 4 bytes into an 8-byte
  ; i64 slot, load the whole i64 and pass it to the helper, then store the
  ; helper's result to dst. Only the low 4 bytes of each %agg.tmp*.coerce slot
  ; are ever written in this function.
  ; NOTE(review): that is presumably why the expected output carries
  ; undef-seeded i64 phis for these values - confirm.
  ; First vector -> dst[0].
316  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
317  %66 = load <8 x i16>, ptr %arrayidx32, align 16
318  %67 = load ptr, ptr %scale.addr, align 8
319  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false)
320  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false)
321  %68 = load i64, ptr %agg.tmp.coerce, align 8
322  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
323  store <8 x i16> %call33, ptr %__s1, align 16
324  %69 = load ptr, ptr %dst.addr, align 8
325  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0
326  %70 = load <8 x i16>, ptr %__s1, align 16
327  %71 = bitcast <8 x i16> %70 to <16 x i8>
328  %72 = bitcast <16 x i8> %71 to <8 x i16>
329  store <8 x i16> %72, ptr %arrayidx34, align 2
  ; Second vector -> dst[4] (index counted in cmplx_int16_t elements).
330  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
331  %73 = load <8 x i16>, ptr %arrayidx36, align 16
332  %74 = load ptr, ptr %scale.addr, align 8
333  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
334  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
335  %75 = load i64, ptr %agg.tmp37.coerce, align 8
336  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
337  store <8 x i16> %call38, ptr %__s135, align 16
338  %76 = load ptr, ptr %dst.addr, align 8
339  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4
340  %77 = load <8 x i16>, ptr %__s135, align 16
341  %78 = bitcast <8 x i16> %77 to <16 x i8>
342  %79 = bitcast <16 x i8> %78 to <8 x i16>
343  store <8 x i16> %79, ptr %arrayidx39, align 2
  ; Third vector -> dst[8].
344  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
345  %80 = load <8 x i16>, ptr %arrayidx41, align 16
346  %81 = load ptr, ptr %scale.addr, align 8
347  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
348  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
349  %82 = load i64, ptr %agg.tmp42.coerce, align 8
350  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
351  store <8 x i16> %call43, ptr %__s140, align 16
352  %83 = load ptr, ptr %dst.addr, align 8
353  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8
354  %84 = load <8 x i16>, ptr %__s140, align 16
355  %85 = bitcast <8 x i16> %84 to <16 x i8>
356  %86 = bitcast <16 x i8> %85 to <8 x i16>
357  store <8 x i16> %86, ptr %arrayidx44, align 2
  ; Advance dst by 12 cmplx_int16_t elements, once for all three stores.
358  %87 = load ptr, ptr %dst.addr, align 8
359  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12
360  store ptr %add.ptr, ptr %dst.addr, align 8
361  br label %if.end
362
363if.else:                                          ; preds = %for.body
  ; %scale is null: store each decompressed vector straight to dst, advancing
  ; dst by 4 cmplx_int16_t elements after every store (12 in total, matching
  ; the other branch).
364  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
365  %88 = load <8 x i16>, ptr %arrayidx46, align 16
366  store <8 x i16> %88, ptr %__s145, align 16
367  %89 = load ptr, ptr %dst.addr, align 8
368  %90 = load <8 x i16>, ptr %__s145, align 16
369  %91 = bitcast <8 x i16> %90 to <16 x i8>
370  %92 = bitcast <16 x i8> %91 to <8 x i16>
371  store <8 x i16> %92, ptr %89, align 2
372  %93 = load ptr, ptr %dst.addr, align 8
373  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4
374  store ptr %add.ptr47, ptr %dst.addr, align 8
375  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
376  %94 = load <8 x i16>, ptr %arrayidx49, align 16
377  store <8 x i16> %94, ptr %__s148, align 16
378  %95 = load ptr, ptr %dst.addr, align 8
379  %96 = load <8 x i16>, ptr %__s148, align 16
380  %97 = bitcast <8 x i16> %96 to <16 x i8>
381  %98 = bitcast <16 x i8> %97 to <8 x i16>
382  store <8 x i16> %98, ptr %95, align 2
383  %99 = load ptr, ptr %dst.addr, align 8
384  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
385  store ptr %add.ptr50, ptr %dst.addr, align 8
386  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
387  %100 = load <8 x i16>, ptr %arrayidx52, align 16
388  store <8 x i16> %100, ptr %__s151, align 16
389  %101 = load ptr, ptr %dst.addr, align 8
390  %102 = load <8 x i16>, ptr %__s151, align 16
391  %103 = bitcast <8 x i16> %102 to <16 x i8>
392  %104 = bitcast <16 x i8> %103 to <8 x i16>
393  store <8 x i16> %104, ptr %101, align 2
394  %105 = load ptr, ptr %dst.addr, align 8
395  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
396  store ptr %add.ptr53, ptr %dst.addr, align 8
397  br label %if.end
398
399if.end:                                           ; preds = %if.else, %if.then
400  br label %for.inc
401
402for.inc:                                          ; preds = %if.end
  ; ++i, then jump back to the loop test.
403  %106 = load i32, ptr %i, align 4
404  %inc = add i32 %106, 1
405  store i32 %inc, ptr %i, align 4
406  br label %for.cond, !llvm.loop !4
407
408for.end:                                          ; preds = %for.cond
  ; All n_prb entries processed; the return value is the constant 0.
409  ret i32 0
410}
411
412define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
413; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
414; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
415; CHECK-NEXT:  [[ENTRY:.*:]]
416; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
417; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
418; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
419; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
420; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
421; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
422; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
423; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
424; CHECK-NEXT:    [[VBSL5_I:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
425; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
426; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
427; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
428; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
429; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
430; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
431; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
432; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
433; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
434; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
435; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
436; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
437; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
438; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
439; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
440; CHECK-NEXT:    [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
441; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I61]]
442;
443entry:
444  %__p0.addr.i102 = alloca <4 x i32>, align 16
445  %__p1.addr.i103 = alloca <4 x i16>, align 8
446  %__p2.addr.i104 = alloca <4 x i16>, align 8
447  %__ret.i105 = alloca <4 x i32>, align 16
448  %__p0.addr.i98 = alloca <4 x i32>, align 16
449  %__p1.addr.i99 = alloca <4 x i16>, align 8
450  %__p2.addr.i100 = alloca <4 x i16>, align 8
451  %__ret.i101 = alloca <4 x i32>, align 16
452  %__p0.addr.i95 = alloca <8 x i16>, align 16
453  %__ret.i96 = alloca <4 x i16>, align 8
454  %__p0.addr.i92 = alloca <8 x i16>, align 16
455  %__ret.i93 = alloca <4 x i16>, align 8
456  %__p0.addr.i89 = alloca <8 x i16>, align 16
457  %__ret.i90 = alloca <4 x i16>, align 8
458  %__p0.addr.i86 = alloca <8 x i16>, align 16
459  %__ret.i87 = alloca <4 x i16>, align 8
460  %__p0.addr.i83 = alloca <8 x i16>, align 16
461  %__ret.i84 = alloca <4 x i16>, align 8
462  %__p0.addr.i80 = alloca <8 x i16>, align 16
463  %__ret.i81 = alloca <4 x i16>, align 8
464  %__p0.addr.i77 = alloca <8 x i16>, align 16
465  %__ret.i78 = alloca <4 x i16>, align 8
466  %__p0.addr.i74 = alloca <8 x i16>, align 16
467  %__ret.i75 = alloca <4 x i16>, align 8
468  %__p0.addr.i69 = alloca <4 x i16>, align 8
469  %__p1.addr.i70 = alloca <4 x i16>, align 8
470  %__ret.i71 = alloca <4 x i32>, align 16
471  %__p0.addr.i66 = alloca <4 x i16>, align 8
472  %__p1.addr.i67 = alloca <4 x i16>, align 8
473  %__ret.i68 = alloca <4 x i32>, align 16
474  %__p0.addr.i64 = alloca <4 x i32>, align 16
475  %__ret.i65 = alloca <8 x i16>, align 16
476  %__p0.addr.i62 = alloca <4 x i32>, align 16
477  %__ret.i63 = alloca <8 x i16>, align 16
478  %__p0.addr.i58 = alloca <8 x i16>, align 16
479  %__p1.addr.i59 = alloca <8 x i16>, align 16
480  %__ret.i60 = alloca <8 x i16>, align 16
481  %__p0.addr.i51 = alloca <4 x i32>, align 16
482  %__p1.addr.i52 = alloca <8 x i16>, align 16
483  %__p2.addr.i53 = alloca <8 x i16>, align 16
484  %__ret.i54 = alloca <4 x i32>, align 16
485  %a.addr.i46 = alloca <4 x i32>, align 16
486  %b.addr.i47 = alloca <8 x i16>, align 16
487  %c.addr.i = alloca <8 x i16>, align 16
488  %__p0.addr.i40 = alloca <8 x i16>, align 16
489  %__p1.addr.i41 = alloca <8 x i16>, align 16
490  %__ret.i42 = alloca <4 x i32>, align 16
491  %a.addr.i = alloca <8 x i16>, align 16
492  %b.addr.i = alloca <8 x i16>, align 16
493  %__p0.addr.i38 = alloca <8 x i16>, align 16
494  %__ret.i39 = alloca <8 x i16>, align 16
495  %__p0.addr.i36 = alloca <8 x i16>, align 16
496  %__p1.addr.i = alloca <8 x i16>, align 16
497  %__p2.addr.i = alloca <8 x i16>, align 16
498  %__ret.i37 = alloca <8 x i16>, align 16
499  %__p0.addr.i29 = alloca i32, align 4
500  %__ret.i30 = alloca <4 x i32>, align 16
501  %.compoundliteral.i31 = alloca <4 x i32>, align 16
502  %__p0.addr.i27 = alloca <4 x i32>, align 16
503  %__ret.i28 = alloca <8 x i16>, align 16
504  %__p0.addr.i16 = alloca i16, align 2
505  %__ret.i17 = alloca <8 x i16>, align 16
506  %.compoundliteral.i18 = alloca <8 x i16>, align 16
507  %__p0.addr.i14 = alloca i16, align 2
508  %__ret.i15 = alloca <8 x i16>, align 16
509  %.compoundliteral.i = alloca <8 x i16>, align 16
510  %__p0.addr.i = alloca <8 x i16>, align 16
511  %__ret.i = alloca <8 x i16>, align 16
512  %scale = alloca %struct.cmplx_int16_t, align 2
513  %a.addr = alloca <8 x i16>, align 16
514  %a_rev = alloca <8 x i16>, align 16
515  %cc = alloca <8 x i16>, align 16
516  %dd = alloca <8 x i16>, align 16
517  %mult_mask = alloca <8 x i16>, align 16
518  %lo32 = alloca <4 x i32>, align 16
519  %hi32 = alloca <4 x i32>, align 16
520  %coerce.val.ii = trunc i64 %scale.coerce to i32
521  store i32 %coerce.val.ii, ptr %scale, align 2
522  store <8 x i16> %a, ptr %a.addr, align 16
523  %0 = load <8 x i16>, ptr %a.addr, align 16
524  store <8 x i16> %0, ptr %__p0.addr.i, align 16
525  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
526  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
527  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
528  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
529  %3 = load <8 x i16>, ptr %__ret.i, align 16
530  store <8 x i16> %3, ptr %a_rev, align 16
531  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
532  %4 = load i16, ptr %re, align 2
533  store i16 %4, ptr %__p0.addr.i16, align 2
534  %5 = load i16, ptr %__p0.addr.i16, align 2
535  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
536  %6 = load i16, ptr %__p0.addr.i16, align 2
537  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
538  %7 = load i16, ptr %__p0.addr.i16, align 2
539  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
540  %8 = load i16, ptr %__p0.addr.i16, align 2
541  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
542  %9 = load i16, ptr %__p0.addr.i16, align 2
543  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
544  %10 = load i16, ptr %__p0.addr.i16, align 2
545  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
546  %11 = load i16, ptr %__p0.addr.i16, align 2
547  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
548  %12 = load i16, ptr %__p0.addr.i16, align 2
549  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
550  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
551  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
552  store <8 x i16> %13, ptr %__ret.i17, align 16
553  %14 = load <8 x i16>, ptr %__ret.i17, align 16
554  store <8 x i16> %14, ptr %cc, align 16
555  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
556  %15 = load i16, ptr %im, align 2
557  store i16 %15, ptr %__p0.addr.i14, align 2
558  %16 = load i16, ptr %__p0.addr.i14, align 2
559  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
560  %17 = load i16, ptr %__p0.addr.i14, align 2
561  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
562  %18 = load i16, ptr %__p0.addr.i14, align 2
563  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
564  %19 = load i16, ptr %__p0.addr.i14, align 2
565  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
566  %20 = load i16, ptr %__p0.addr.i14, align 2
567  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
568  %21 = load i16, ptr %__p0.addr.i14, align 2
569  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
570  %22 = load i16, ptr %__p0.addr.i14, align 2
571  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
572  %23 = load i16, ptr %__p0.addr.i14, align 2
573  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
574  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
575  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
576  store <8 x i16> %24, ptr %__ret.i15, align 16
577  %25 = load <8 x i16>, ptr %__ret.i15, align 16
578  store <8 x i16> %25, ptr %dd, align 16
579  store i32 65535, ptr %__p0.addr.i29, align 4
580  %26 = load i32, ptr %__p0.addr.i29, align 4
581  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
582  %27 = load i32, ptr %__p0.addr.i29, align 4
583  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
584  %28 = load i32, ptr %__p0.addr.i29, align 4
585  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
586  %29 = load i32, ptr %__p0.addr.i29, align 4
587  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
588  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
589  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
590  store <4 x i32> %30, ptr %__ret.i30, align 16
591  %31 = load <4 x i32>, ptr %__ret.i30, align 16
592  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
593  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
594  %33 = bitcast <4 x i32> %32 to <8 x i16>
595  store <8 x i16> %33, ptr %__ret.i28, align 16
596  %34 = load <8 x i16>, ptr %__ret.i28, align 16
597  store <8 x i16> %34, ptr %mult_mask, align 16
598  %35 = load <8 x i16>, ptr %mult_mask, align 16
599  %36 = load <8 x i16>, ptr %dd, align 16
600  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
601  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
602  %38 = bitcast <8 x i16> %37 to <16 x i8>
603  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
604  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
605  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
606  %39 = load <8 x i16>, ptr %__ret.i39, align 16
607  %40 = load <8 x i16>, ptr %dd, align 16
608  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
609  store <8 x i16> %39, ptr %__p1.addr.i, align 16
610  store <8 x i16> %40, ptr %__p2.addr.i, align 16
611  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
612  %42 = bitcast <8 x i16> %41 to <16 x i8>
613  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
614  %44 = bitcast <8 x i16> %43 to <16 x i8>
615  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
616  %46 = bitcast <8 x i16> %45 to <16 x i8>
617  %vbsl3.i = and <8 x i16> %41, %43
618  %47 = xor <8 x i16> %41, splat (i16 -1)
619  %vbsl4.i = and <8 x i16> %47, %45
620  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
621  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
622  %48 = load <8 x i16>, ptr %__ret.i37, align 16
623  store <8 x i16> %48, ptr %dd, align 16
624  %49 = load <8 x i16>, ptr %a.addr, align 16
625  %50 = load <8 x i16>, ptr %cc, align 16
626  store <8 x i16> %49, ptr %a.addr.i, align 16
627  store <8 x i16> %50, ptr %b.addr.i, align 16
628  %51 = load <8 x i16>, ptr %a.addr.i, align 16
629  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
630  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
631  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
632  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
633  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
634  %54 = load <4 x i16>, ptr %__ret.i84, align 8
635  %55 = load <8 x i16>, ptr %b.addr.i, align 16
636  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
637  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
638  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
639  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
640  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
641  %58 = load <4 x i16>, ptr %__ret.i81, align 8
642  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
643  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
644  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
645  %60 = bitcast <4 x i16> %59 to <8 x i8>
646  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
647  %62 = bitcast <4 x i16> %61 to <8 x i8>
648  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
649  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
650  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
651  %63 = load <4 x i32>, ptr %__ret.i71, align 16
652  store <4 x i32> %63, ptr %lo32, align 16
653  %64 = load <8 x i16>, ptr %a.addr, align 16
654  %65 = load <8 x i16>, ptr %cc, align 16
655  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
656  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
657  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
658  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
659  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
660  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
661  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
662  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
663  %69 = load <4 x i16>, ptr %__ret.i96, align 8
664  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
665  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
666  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
667  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
668  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
669  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
670  %73 = load <4 x i16>, ptr %__ret.i93, align 8
671  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
672  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
673  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
674  %75 = bitcast <4 x i16> %74 to <8 x i8>
675  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
676  %77 = bitcast <4 x i16> %76 to <8 x i8>
677  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
678  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
679  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
680  %78 = load <4 x i32>, ptr %__ret.i68, align 16
681  store <4 x i32> %78, ptr %__ret.i42, align 16
682  %79 = load <4 x i32>, ptr %__ret.i42, align 16
683  store <4 x i32> %79, ptr %hi32, align 16
684  %80 = load <4 x i32>, ptr %lo32, align 16
685  %81 = load <8 x i16>, ptr %a_rev, align 16
686  %82 = load <8 x i16>, ptr %dd, align 16
687  store <4 x i32> %80, ptr %a.addr.i46, align 16
688  store <8 x i16> %81, ptr %b.addr.i47, align 16
689  store <8 x i16> %82, ptr %c.addr.i, align 16
690  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
691  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
692  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
693  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
694  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
695  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
696  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
697  %87 = load <4 x i16>, ptr %__ret.i78, align 8
698  %88 = load <8 x i16>, ptr %c.addr.i, align 16
699  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
700  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
701  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
702  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
703  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
704  %91 = load <4 x i16>, ptr %__ret.i75, align 8
705  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
706  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
707  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
708  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
709  %93 = bitcast <4 x i32> %92 to <16 x i8>
710  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
711  %95 = bitcast <4 x i16> %94 to <8 x i8>
712  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
713  %97 = bitcast <4 x i16> %96 to <8 x i8>
714  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
715  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
716  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
717  %98 = load <4 x i32>, ptr %__ret.i105, align 16
718  store <4 x i32> %98, ptr %lo32, align 16
719  %99 = load <4 x i32>, ptr %hi32, align 16
720  %100 = load <8 x i16>, ptr %a_rev, align 16
721  %101 = load <8 x i16>, ptr %dd, align 16
722  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
723  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
724  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
725  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
726  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
727  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
728  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
729  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
730  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
731  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
732  %106 = load <4 x i16>, ptr %__ret.i90, align 8
733  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
734  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
735  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
736  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
737  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
738  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
739  %110 = load <4 x i16>, ptr %__ret.i87, align 8
740  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
741  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
742  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
743  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
744  %112 = bitcast <4 x i32> %111 to <16 x i8>
745  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
746  %114 = bitcast <4 x i16> %113 to <8 x i8>
747  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
748  %116 = bitcast <4 x i16> %115 to <8 x i8>
749  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
750  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
751  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
752  %117 = load <4 x i32>, ptr %__ret.i101, align 16
753  store <4 x i32> %117, ptr %__ret.i54, align 16
754  %118 = load <4 x i32>, ptr %__ret.i54, align 16
755  store <4 x i32> %118, ptr %hi32, align 16
756  %119 = load <4 x i32>, ptr %lo32, align 16
757  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
758  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
759  %121 = bitcast <4 x i32> %120 to <8 x i16>
760  store <8 x i16> %121, ptr %__ret.i65, align 16
761  %122 = load <8 x i16>, ptr %__ret.i65, align 16
762  %123 = load <4 x i32>, ptr %hi32, align 16
763  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
764  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
765  %125 = bitcast <4 x i32> %124 to <8 x i16>
766  store <8 x i16> %125, ptr %__ret.i63, align 16
767  %126 = load <8 x i16>, ptr %__ret.i63, align 16
768  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
769  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
770  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
771  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
772  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
773  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
774  %129 = load <8 x i16>, ptr %__ret.i60, align 16
775  ret <8 x i16> %129
776}
777
; Declarations of the intrinsics referenced by this module. The trailing
; #1 / #2 on each declaration selects one of the attribute groups defined
; further down in this file.

; No call to this declaration is visible in the function body that ends just
; above; NOTE(review): presumably used by a function earlier in the file.
778; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
779declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
780
; Called in the function body above on an <8 x i16> value; the result there is
; named %vqnegq_v1.i.
781; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
782declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2
783
; Called in the function body above both on its own (results named
; %vqdmull_v2.*) and as the first step of the sqdmull + sqadd pairs (results
; named %vqdmlal2.*).
784; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
785declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2
786
; Called in the function body above with a <4 x i32> value as the first operand
; and the preceding sqdmull result as the second (results named %vqdmlal_v3.*).
787; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
788declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2
789
; Attribute groups selected by the `#N` suffixes in this module:
;   #0 - used by the function definition at the top of the file (and possibly
;        by other definitions not visible from here). Among other things it
;        marks the function noinline and lists +neon, +sve and +sve2 in
;        "target-features".
;   #1 - used by the llvm.memcpy declaration.
;   #2 - used by the three llvm.aarch64.neon.* declarations.
790attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
791attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
792attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
793
; Module-level named metadata: !llvm.module.flags lists the three flag nodes
; !0-!2, and !llvm.ident lists the producer string node !3.
794!llvm.module.flags = !{!0, !1, !2}
795!llvm.ident = !{!3}
796
797!0 = !{i32 1, !"wchar_size", i32 4}
798!1 = !{i32 7, !"uwtable", i32 2}
799!2 = !{i32 7, !"frame-pointer", i32 1}
800!3 = !{!"clang version 20.0.0git"}
; !4 is a distinct node whose first operand is itself and whose second operand
; is !5, the "llvm.loop.mustprogress" string. The autogenerated FileCheck
; patterns at the very end of the file expect a node pair of the same shape in
; the optimized output.
; NOTE(review): the instruction that references !4 is not visible in this part
; of the file - presumably a loop branch via !llvm.loop; confirm.
801!4 = distinct !{!4, !5}
802!5 = !{!"llvm.loop.mustprogress"}
803;.
804; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
805; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
806;.
807