; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes="default<O3>" -S %s | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"

%struct.cmplx_int16_t = type { i16, i16 }
%struct.compressed_data_8bit = type { i8, [24 x i8] }

define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
; CHECK-NEXT: br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
; CHECK: [[FOR_BODY_LR_PH]]:
; CHECK-NEXT: [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
; CHECK-NEXT: [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
; CHECK-NEXT: br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY_US]]:
; CHECK-NEXT: [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
; CHECK-NEXT: [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
; CHECK-NEXT: [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
; CHECK-NEXT: [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
; CHECK-NEXT: [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
; CHECK-NEXT: [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
; CHECK-NEXT: [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
; CHECK-NEXT: [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
; CHECK-NEXT: [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
; CHECK-NEXT: [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
; CHECK-NEXT: [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
; CHECK-NEXT: [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
; CHECK-NEXT: store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
; CHECK-NEXT: [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
; CHECK-NEXT: store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
; CHECK-NEXT: [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
; CHECK-NEXT: store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
; CHECK-NEXT: [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
; CHECK-NEXT: [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
; CHECK-NEXT: [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
; CHECK-NEXT: br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
; CHECK-NEXT: [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
; CHECK-NEXT: [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
; CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP7]] to i16
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
; CHECK-NEXT: [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
; CHECK-NEXT: [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
; CHECK-NEXT: [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
; CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
; CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
; CHECK-NEXT: [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
; CHECK-NEXT: [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
; CHECK-NEXT: [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
; CHECK-NEXT: [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
; CHECK-NEXT: store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
; CHECK-NEXT: [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
; CHECK-NEXT: [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
; CHECK-NEXT: [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
; CHECK-NEXT: [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
; CHECK-NEXT: [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
; CHECK-NEXT: store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
; CHECK-NEXT: [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
; CHECK-NEXT: [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
; CHECK-NEXT: [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
; CHECK-NEXT: [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
; CHECK-NEXT: [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
; CHECK-NEXT: store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
; CHECK-NEXT: [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
; CHECK: [[FOR_END]]:
; CHECK-NEXT: ret i32 0
;
entry:
  %__p0.addr.i75 = alloca <8 x i16>, align 16
  %__p1.addr.i76 = alloca i16, align 2
  %__ret.i77 = alloca <8 x i16>, align 16
  %.compoundliteral.i78 = alloca <8 x i16>, align 16
  %__p0.addr.i62 = alloca <8 x i16>, align 16
  %__p1.addr.i63 = alloca i16, align 2
  %__ret.i64 = alloca <8 x i16>, align 16
  %.compoundliteral.i65 = alloca <8 x i16>, align 16
  %__p0.addr.i60 = alloca <8 x i16>, align 16
  %__p1.addr.i = alloca i16, align 2
  %__ret.i61 = alloca <8 x i16>, align 16
  %.compoundliteral.i = alloca <8 x i16>, align 16
  %__p0.addr.i57 = alloca <8 x i8>, align 8
  %__ret.i58 = alloca <8 x i16>, align 16
  %__p0.addr.i54 = alloca <8 x i8>, align 8
  %__ret.i55 = alloca <8 x i16>, align 16
  %__p0.addr.i = alloca <8 x i8>, align 8
  %__ret.i = alloca <8 x i16>, align 16
  %n_prb.addr = alloca i32, align 4
  %src.addr = alloca ptr, align 8
  %dst.addr = alloca ptr, align 8
  %scale.addr = alloca ptr, align 8
  %i = alloca i32, align 4
  %prb_comp_in = alloca [3 x <8 x i16>], align 16
  %__ret = alloca <8 x i8>, align 8
  %tmp = alloca <8 x i8>, align 8
  %__ret3 = alloca <8 x i8>, align 8
  %tmp8 = alloca <8 x i8>, align 8
  %__ret11 = alloca <8 x i8>, align 8
  %tmp16 = alloca <8 x i8>, align 8
  %prb_decomp = alloca [3 x <8 x i16>], align 16
  %scaling_factor = alloca i16, align 2
  %__s1 = alloca <8 x i16>, align 16
  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
  %agg.tmp.coerce = alloca i64, align 8
  %__s135 = alloca <8 x i16>, align 16
  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
  %agg.tmp37.coerce = alloca i64, align 8
  %__s140 = alloca <8 x i16>, align 16
  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
  %agg.tmp42.coerce = alloca i64, align 8
  %__s145 = alloca <8 x i16>, align 16
  %__s148 = alloca <8 x i16>, align 16
  %__s151 = alloca <8 x i16>, align 16
  store i32 %n_prb, ptr %n_prb.addr, align 4
  store ptr %src, ptr %src.addr, align 8
  store ptr %dst, ptr %dst.addr, align 8
  store ptr %scale, ptr %scale.addr, align 8
  store i32 0, ptr %i, align 4
  br label %for.cond

for.cond: ; preds = %for.inc, %entry
  %0 = load i32, ptr %i, align 4
  %1 = load i32, ptr %n_prb.addr, align 4
  %cmp = icmp ult i32 %0, %1
  br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
  %2 = load ptr, ptr %src.addr, align 8
  %3 = load i32, ptr %i, align 4
  %idxprom = zext i32 %3 to i64
  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom
  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1
  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0
  %4 = load <8 x i8>, ptr %arrayidx1, align 1
  store <8 x i8> %4, ptr %__ret, align 8
  %5 = load <8 x i8>, ptr %__ret, align 8
  store <8 x i8> %5, ptr %tmp, align 8
  %6 = load <8 x i8>, ptr %tmp, align 8
  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16>
  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
  %8 = load <8 x i16>, ptr %__ret.i58, align 16
  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
  store <8 x i16> %8, ptr %arrayidx2, align 16
  %9 = load ptr, ptr %src.addr, align 8
  %10 = load i32, ptr %i, align 4
  %idxprom4 = zext i32 %10 to i64
  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8
  %11 = load <8 x i8>, ptr %arrayidx7, align 1
  store <8 x i8> %11, ptr %__ret3, align 8
  %12 = load <8 x i8>, ptr %__ret3, align 8
  store <8 x i8> %12, ptr %tmp8, align 8
  %13 = load <8 x i8>, ptr %tmp8, align 8
  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
  %15 = load <8 x i16>, ptr %__ret.i55, align 16
  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
  store <8 x i16> %15, ptr %arrayidx10, align 16
  %16 = load ptr, ptr %src.addr, align 8
  %17 = load i32, ptr %i, align 4
  %idxprom12 = zext i32 %17 to i64
  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16
  %18 = load <8 x i8>, ptr %arrayidx15, align 1
  store <8 x i8> %18, ptr %__ret11, align 8
  %19 = load <8 x i8>, ptr %__ret11, align 8
  store <8 x i8> %19, ptr %tmp16, align 8
  %20 = load <8 x i8>, ptr %tmp16, align 8
  store <8 x i8> %20, ptr %__p0.addr.i, align 8
  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
  %22 = load <8 x i16>, ptr %__ret.i, align 16
  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
  store <8 x i16> %22, ptr %arrayidx18, align 16
  %23 = load ptr, ptr %src.addr, align 8
  %24 = load i32, ptr %i, align 4
  %idxprom19 = zext i32 %24 to i64
  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0
  %25 = load i8, ptr %exp, align 1
  %conv = sext i8 %25 to i32
  %mul = mul nsw i32 %conv, 2
  %conv21 = trunc i32 %mul to i16
  store i16 %conv21, ptr %scaling_factor, align 2
  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
  %26 = load <8 x i16>, ptr %arrayidx22, align 16
  %27 = load i16, ptr %scaling_factor, align 2
  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
  store i16 %27, ptr %__p1.addr.i76, align 2
  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
  %29 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0
  %30 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
  %31 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
  %32 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
  %33 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
  %34 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
  %35 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
  %36 = load i16, ptr %__p1.addr.i76, align 2
  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
  %mul.i87 = mul <8 x i16> %28, %37
  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
  %38 = load <8 x i16>, ptr %__ret.i77, align 16
  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
  store <8 x i16> %38, ptr %arrayidx24, align 16
  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
  %39 = load <8 x i16>, ptr %arrayidx25, align 16
  %40 = load i16, ptr %scaling_factor, align 2
  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
  store i16 %40, ptr %__p1.addr.i63, align 2
  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
  %42 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
  %43 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
  %44 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
  %45 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
  %46 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
  %47 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
  %48 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
  %49 = load i16, ptr %__p1.addr.i63, align 2
  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
  %mul.i74 = mul <8 x i16> %41, %50
  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
  %51 = load <8 x i16>, ptr %__ret.i64, align 16
  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
  store <8 x i16> %51, ptr %arrayidx27, align 16
  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
  %52 = load <8 x i16>, ptr %arrayidx28, align 16
  %53 = load i16, ptr %scaling_factor, align 2
  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
  store i16 %53, ptr %__p1.addr.i, align 2
  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
  %55 = load i16, ptr %__p1.addr.i, align 2
  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
  %56 = load i16, ptr %__p1.addr.i, align 2
  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
  %57 = load i16, ptr %__p1.addr.i, align 2
  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
  %58 = load i16, ptr %__p1.addr.i, align 2
  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
  %59 = load i16, ptr %__p1.addr.i, align 2
  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
  %60 = load i16, ptr %__p1.addr.i, align 2
  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
  %61 = load i16, ptr %__p1.addr.i, align 2
  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
  %62 = load i16, ptr %__p1.addr.i, align 2
  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
  %mul.i = mul <8 x i16> %54, %63
  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
  %64 = load <8 x i16>, ptr %__ret.i61, align 16
  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
  store <8 x i16> %64, ptr %arrayidx30, align 16
  %65 = load ptr, ptr %scale.addr, align 8
  %cmp31 = icmp ne ptr %65, null
  br i1 %cmp31, label %if.then, label %if.else

if.then: ; preds = %for.body
  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
  %66 = load <8 x i16>, ptr %arrayidx32, align 16
  %67 = load ptr, ptr %scale.addr, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false)
  %68 = load i64, ptr %agg.tmp.coerce, align 8
  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
  store <8 x i16> %call33, ptr %__s1, align 16
  %69 = load ptr, ptr %dst.addr, align 8
  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0
  %70 = load <8 x i16>, ptr %__s1, align 16
  %71 = bitcast <8 x i16> %70 to <16 x i8>
  %72 = bitcast <16 x i8> %71 to <8 x i16>
  store <8 x i16> %72, ptr %arrayidx34, align 2
  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
  %73 = load <8 x i16>, ptr %arrayidx36, align 16
  %74 = load ptr, ptr %scale.addr, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
  %75 = load i64, ptr %agg.tmp37.coerce, align 8
  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
  store <8 x i16> %call38, ptr %__s135, align 16
  %76 = load ptr, ptr %dst.addr, align 8
  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4
  %77 = load <8 x i16>, ptr %__s135, align 16
  %78 = bitcast <8 x i16> %77 to <16 x i8>
  %79 = bitcast <16 x i8> %78 to <8 x i16>
  store <8 x i16> %79, ptr %arrayidx39, align 2
  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
  %80 = load <8 x i16>, ptr %arrayidx41, align 16
  %81 = load ptr, ptr %scale.addr, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
  %82 = load i64, ptr %agg.tmp42.coerce, align 8
  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
  store <8 x i16> %call43, ptr %__s140, align 16
  %83 = load ptr, ptr %dst.addr, align 8
  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8
  %84 = load <8 x i16>, ptr %__s140, align 16
  %85 = bitcast <8 x i16> %84 to <16 x i8>
  %86 = bitcast <16 x i8> %85 to <8 x i16>
  store <8 x i16> %86, ptr %arrayidx44, align 2
  %87 = load ptr, ptr %dst.addr, align 8
  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12
  store ptr %add.ptr, ptr %dst.addr, align 8
  br label %if.end

if.else: ; preds = %for.body
  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
  %88 = load <8 x i16>, ptr %arrayidx46, align 16
  store <8 x i16> %88, ptr %__s145, align 16
  %89 = load ptr, ptr %dst.addr, align 8
  %90 = load <8 x i16>, ptr %__s145, align 16
  %91 = bitcast <8 x i16> %90 to <16 x i8>
  %92 = bitcast <16 x i8> %91 to <8 x i16>
  store <8 x i16> %92, ptr %89, align 2
  %93 = load ptr, ptr %dst.addr, align 8
  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4
  store ptr %add.ptr47, ptr %dst.addr, align 8
  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
  %94 = load <8 x i16>, ptr %arrayidx49, align 16
  store <8 x i16> %94, ptr %__s148, align 16
  %95 = load ptr, ptr %dst.addr, align 8
  %96 = load <8 x i16>, ptr %__s148, align 16
  %97 = bitcast <8 x i16> %96 to <16 x i8>
  %98 = bitcast <16 x i8> %97 to <8 x i16>
  store <8 x i16> %98, ptr %95, align 2
  %99 = load ptr, ptr %dst.addr, align 8
  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
  store ptr %add.ptr50, ptr %dst.addr, align 8
  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
  %100 = load <8 x i16>, ptr %arrayidx52, align 16
  store <8 x i16> %100, ptr %__s151, align 16
  %101 = load ptr, ptr %dst.addr, align 8
  %102 = load <8 x i16>, ptr %__s151, align 16
  %103 = bitcast <8 x i16> %102 to <16 x i8>
  %104 = bitcast <16 x i8> %103 to <8 x i16>
  store <8 x i16> %104, ptr %101, align 2
  %105 = load ptr, ptr %dst.addr, align 8
  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
  store ptr %add.ptr53, ptr %dst.addr, align 8
  br label %if.end

if.end: ; preds = %if.else, %if.then
  br label %for.inc

for.inc: ; preds = %if.end
  %106 = load i32, ptr %i, align 4
  %inc = add i32 %106, 1
  store i32 %inc, ptr %i, align 4
  br label %for.cond, !llvm.loop !4

for.end: ; preds = %for.cond
  ret i32 0
}

define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
; CHECK-NEXT: [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
; CHECK-NEXT: [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; CHECK-NEXT: [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
; CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
; CHECK-NEXT: [[VBSL5_I:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <8 x i32> <i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
; CHECK-NEXT: [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
; CHECK-NEXT: [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
; CHECK-NEXT: [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
; CHECK-NEXT: [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
; CHECK-NEXT: [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[SHUFFLE_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VBSL5_I]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
; CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
; CHECK-NEXT: [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I61]]
;
entry:
  %__p0.addr.i102 = alloca <4 x i32>, align 16
  %__p1.addr.i103 = alloca <4 x i16>, align 8
  %__p2.addr.i104 = alloca <4 x i16>, align 8
  %__ret.i105 = alloca <4 x i32>, align 16
  %__p0.addr.i98 = alloca <4 x i32>, align 16
  %__p1.addr.i99 = alloca <4 x i16>, align 8
  %__p2.addr.i100 = alloca <4 x i16>, align 8
  %__ret.i101 = alloca <4 x i32>, align 16
  %__p0.addr.i95 = alloca <8 x i16>, align 16
  %__ret.i96 = alloca <4 x i16>, align 8
  %__p0.addr.i92 = alloca <8 x i16>, align 16
  %__ret.i93 = alloca <4 x i16>, align 8
  %__p0.addr.i89 = alloca <8 x i16>, align 16
  %__ret.i90 = alloca <4 x i16>, align 8
  %__p0.addr.i86 = alloca <8 x i16>, align 16
  %__ret.i87 = alloca <4 x i16>, align 8
  %__p0.addr.i83 = alloca <8 x i16>, align 16
  %__ret.i84 = alloca <4 x i16>, align 8
  %__p0.addr.i80 = alloca <8 x i16>, align 16
  %__ret.i81 = alloca <4 x i16>, align 8
  %__p0.addr.i77 = alloca <8 x i16>, align 16
  %__ret.i78 = alloca <4 x i16>, align 8
  %__p0.addr.i74 = alloca <8 x i16>, align 16
  %__ret.i75 = alloca <4 x i16>, align 8
  %__p0.addr.i69 = alloca <4 x i16>, align 8
  %__p1.addr.i70 = alloca <4 x i16>, align 8
  %__ret.i71 = alloca <4 x i32>, align 16
  %__p0.addr.i66 = alloca <4 x i16>, align 8
  %__p1.addr.i67 = alloca <4 x i16>, align 8
  %__ret.i68 = alloca <4 x i32>, align 16
  %__p0.addr.i64 = alloca <4 x i32>, align 16
  %__ret.i65 = alloca <8 x i16>, align 16
  %__p0.addr.i62 = alloca <4 x i32>, align 16
  %__ret.i63 = alloca <8 x i16>, align 16
  %__p0.addr.i58 = alloca <8 x i16>, align 16
  %__p1.addr.i59 = alloca <8 x i16>, align 16
  %__ret.i60 = alloca <8 x i16>, align 16
  %__p0.addr.i51 = alloca <4 x i32>, align 16
  %__p1.addr.i52 = alloca <8 x i16>, align 16
  %__p2.addr.i53 = alloca <8 x i16>, align 16
  %__ret.i54 = alloca <4 x i32>, align 16
  %a.addr.i46 = alloca <4 x i32>, align 16
  %b.addr.i47 = alloca <8 x i16>, align 16
  %c.addr.i = alloca <8 x i16>, align 16
  %__p0.addr.i40 = alloca <8 x i16>, align 16
  %__p1.addr.i41 = alloca <8 x i16>, align 16
  %__ret.i42 = alloca <4 x i32>, align 16
  %a.addr.i = alloca <8 x i16>, align 16
  %b.addr.i = alloca <8 x i16>, align 16
  %__p0.addr.i38 = alloca <8 x i16>, align 16
  %__ret.i39 = alloca <8 x i16>, align 16
  %__p0.addr.i36 = alloca <8 x i16>, align 16
  %__p1.addr.i = alloca <8 x i16>, align 16
  %__p2.addr.i = alloca <8 x i16>, align 16
  %__ret.i37 = alloca <8 x i16>, align 16
  %__p0.addr.i29 = alloca i32, align 4
  %__ret.i30 = alloca <4 x i32>, align 16
  %.compoundliteral.i31 = alloca <4 x i32>, align 16
  %__p0.addr.i27 = alloca <4 x i32>, align 16
  %__ret.i28 = alloca <8 x i16>, align 16
  %__p0.addr.i16 = alloca i16, align 2
  %__ret.i17 = alloca <8 x i16>, align 16
  %.compoundliteral.i18 = alloca <8 x i16>, align 16
  %__p0.addr.i14 = alloca i16, align 2
  %__ret.i15 = alloca <8 x i16>, align 16
  %.compoundliteral.i = alloca <8 x i16>, align 16
  %__p0.addr.i = alloca <8 x i16>, align 16
  %__ret.i = alloca <8 x i16>, align 16
  %scale = alloca %struct.cmplx_int16_t, align 2
  %a.addr = alloca <8 x i16>, align 16
  %a_rev = alloca <8 x i16>, align 16
  %cc = alloca <8 x i16>, align 16
  %dd = alloca <8 x i16>, align 16
  %mult_mask = alloca <8 x i16>, align 16
  %lo32 = alloca <4 x i32>, align 16
  %hi32 = alloca <4 x i32>, align 16
  %coerce.val.ii = trunc i64 %scale.coerce to i32
  store i32 %coerce.val.ii, ptr %scale, align 2
  store <8 x i16> %a, ptr %a.addr, align 16
  %0 = load <8 x i16>, ptr %a.addr, align 16
  store <8 x i16> %0, ptr %__p0.addr.i, align 16
  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
  %3 = load <8 x i16>, ptr %__ret.i, align 16
  store <8 x i16> %3, ptr %a_rev, align 16
  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
  %4 = load i16, ptr %re, align 2
  store i16 %4, ptr %__p0.addr.i16, align 2
  %5 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
  %6 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
  %7 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
  %8 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
  %9 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
  %10 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
  %11 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
  %12 = load i16, ptr %__p0.addr.i16, align 2
  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
  store <8 x i16> %13, ptr %__ret.i17, align 16
  %14 = load <8 x i16>, ptr %__ret.i17, align 16
  store <8 x i16> %14, ptr %cc, align 16
  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
  %15 = load i16, ptr %im, align 2
  store i16 %15, ptr %__p0.addr.i14, align 2
  %16 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
  %17 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
  %18 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
  %19 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
  %20 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
  %21 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
  %22 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
  %23 = load i16, ptr %__p0.addr.i14, align 2
  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
  store <8 x i16> %24, ptr %__ret.i15, align 16
  %25 = load <8 x i16>, ptr %__ret.i15, align 16
  store <8 x i16> %25, ptr %dd, align 16
  store i32 65535, ptr %__p0.addr.i29, align 4
  %26 = load i32, ptr %__p0.addr.i29, align 4
  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
  %27 = load i32, ptr %__p0.addr.i29, align 4
  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
  %28 = load i32, ptr %__p0.addr.i29, align 4
  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
  %29 = load i32, ptr %__p0.addr.i29, align 4
  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
  store <4 x i32> %30, ptr %__ret.i30, align 16
  %31 = load <4 x i32>, ptr %__ret.i30, align 16
  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
  %33 = bitcast <4 x i32> %32 to <8 x i16>
  store <8 x i16> %33, ptr %__ret.i28, align 16
  %34 = load <8 x i16>, ptr %__ret.i28, align 16
  store <8 x i16> %34, ptr %mult_mask, align 16
  %35 = load <8 x i16>, ptr %mult_mask, align 16
  %36 = load <8 x i16>, ptr %dd, align 16
  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
  %38 = bitcast <8 x i16> %37 to <16 x i8>
  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
  %39 = load <8 x i16>, ptr %__ret.i39, align 16
  %40 = load <8 x i16>, ptr %dd, align 16
  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
  store <8 x i16> %39, ptr %__p1.addr.i, align 16
  store <8 x i16> %40, ptr %__p2.addr.i, align 16
  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
  %42 = bitcast <8 x i16> %41 to <16 x i8>
  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
  %44 = bitcast <8 x i16> %43 to <16 x i8>
  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
  %46 = bitcast <8 x i16> %45 to <16 x i8>
  %vbsl3.i = and <8 x i16> %41, %43
  %47 = xor <8 x i16> %41, splat (i16 -1)
  %vbsl4.i = and <8 x i16> %47, %45
  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
  %48 = load <8 x i16>, ptr %__ret.i37, align 16
  store <8 x i16> %48, ptr %dd, align 16
  %49 = load <8 x i16>, ptr %a.addr, align 16
  %50 = load <8 x i16>, ptr %cc, align 16
  store <8 x i16> %49, ptr %a.addr.i, align 16
  store <8 x i16> %50, ptr %b.addr.i, align 16
  %51 = load <8 x i16>, ptr %a.addr.i, align 16
  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
  %54 = load <4 x i16>, ptr %__ret.i84, align 8
  %55 = load <8 x i16>, ptr %b.addr.i, align 16
  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
  %58 = load <4 x i16>, ptr %__ret.i81, align 8
  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
  %60 = bitcast <4 x i16> %59 to <8 x i8>
  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
  %62 = bitcast <4 x i16> %61 to <8 x i8>
  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
  %63 = load <4 x i32>, ptr %__ret.i71, align 16
  store <4 x i32> %63, ptr %lo32, align 16
  %64 = load <8 x i16>, ptr %a.addr, align 16
  %65 = load <8 x i16>, ptr %cc, align 16
  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
  %69 = load <4 x i16>, ptr %__ret.i96, align 8
  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
  %73 = load <4 x i16>, ptr %__ret.i93, align 8
  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
  %75 = bitcast <4 x i16> %74 to <8 x i8>
  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
  %77 = bitcast <4 x i16> %76 to <8 x i8>
  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
  %78 = load <4 x i32>, ptr %__ret.i68, align 16
  store <4 x i32> %78, ptr %__ret.i42, align 16
  %79 = load <4 x i32>, ptr %__ret.i42, align 16
  store <4 x i32> %79, ptr %hi32, align 16
  %80 = load <4 x i32>, ptr %lo32, align 16
  %81 = load <8 x i16>, ptr %a_rev, align 16
  %82 = load <8 x i16>, ptr %dd, align 16
  store <4 x i32> %80, ptr %a.addr.i46, align 16
  store <8 x i16> %81, ptr %b.addr.i47, align 16
  store <8 x i16> %82, ptr %c.addr.i, align 16
  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
  %87 = load <4 x i16>, ptr %__ret.i78, align 8
  %88 = load <8 x i16>, ptr %c.addr.i, align 16
  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
  %91 = load <4 x i16>, ptr %__ret.i75, align 8
  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
  %93 = bitcast <4 x i32> %92 to <16 x i8>
  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
  %95 = bitcast <4 x i16> %94 to <8 x i8>
  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
  %97 = bitcast <4 x i16> %96 to <8 x i8>
  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
  %98 = load <4 x i32>, ptr %__ret.i105, align 16
  store <4 x i32> %98, ptr %lo32, align 16
  %99 = load <4 x i32>, ptr %hi32, align 16
  %100 = load <8 x i16>, ptr %a_rev, align 16
  %101 = load <8 x i16>, ptr %dd, align 16
  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
  %106 = load <4 x i16>, ptr %__ret.i90, align 8
  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
  %110 = load <4 x i16>, ptr %__ret.i87, align 8
  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
  %112 = bitcast <4 x i32> %111 to <16 x i8>
  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
  %114 = bitcast <4 x i16> %113 to <8 x i8>
  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
  %116 = bitcast <4 x i16> %115 to <8 x i8>
  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
  %117 = load <4 x i32>, ptr %__ret.i101, align 16
  store <4 x i32> %117, ptr %__ret.i54, align 16
  %118 = load <4 x i32>, ptr %__ret.i54, align 16
  store <4 x i32> %118, ptr %hi32, align 16
  %119 = load <4 x i32>, ptr %lo32, align 16
  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
  %121 = bitcast <4 x i32> %120 to <8 x i16>
  store <8 x i16> %121, ptr %__ret.i65, align 16
  %122 = load <8 x i16>, ptr %__ret.i65, align 16
  %123 = load <4 x i32>, ptr %hi32, align 16
  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
  %125 = bitcast <4 x i32> %124 to <8 x i16>
  store <8 x i16> %125, ptr %__ret.i63, align 16
  %126 = load <8 x i16>, ptr %__ret.i63, align 16
  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
  %129 = load <8 x i16>, ptr %__ret.i60, align 16
  ret <8 x i16> %129
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2

attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 2}
!2 = !{i32 7, !"frame-pointer", i32 1}
!3 = !{!"clang version 20.0.0git"}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.mustprogress"}
;.
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
;.