// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s

// REQUIRES: aarch64-registered-target

// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics
// NOTE(review): every CHECK line in this file is autogenerated; refresh them
// with utils/update_cc_test_checks.py rather than editing them by hand.

#include <arm_neon.h>

// Vector form
// The *_low intrinsics lower to @llvm.aarch64.neon.fmlal/fmlsl and the
// *_high intrinsics to the fmlal2/fmlsl2 counterparts, as checked below.

// CHECK-LABEL: @test_vfmlal_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT:    ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlal_low_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlsl_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT:    ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlsl_low_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlal_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT:    ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlal_high_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlsl_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT:    ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlsl_high_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlalq_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT:    ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlalq_low_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlslq_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT:    ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlslq_low_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlalq_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT:    ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlalq_high_f16(a, b, c);
}

// CHECK-LABEL: @test_vfmlslq_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT:    ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlslq_high_f16(a, b, c);
}

// Indexed form
// The _lane/_laneq variants first splat the selected lane of c (visible below
// as the __reint alloca + extractelement + insertelement sequence at -O0)
// and then feed the splat into the same fmlal/fmlal2 intrinsics.

// CHECK-LABEL: @test_vfmlal_lane_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_847:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_847:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_8475:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84715:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84725:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
// CHECK-NEXT:    store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
// CHECK-NEXT:    store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0
// CHECK-NEXT:    store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0
// CHECK-NEXT:    store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT:    ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlal_lane_low_f16(a, b, c, 0);
}

// CHECK-LABEL: @test_vfmlal_lane_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_847:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_847:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_8475:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84715:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84725:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
// CHECK-NEXT:    store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
// CHECK-NEXT:    store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1
// CHECK-NEXT:    store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1
// CHECK-NEXT:    store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT:    ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlal_lane_high_f16(a, b, c, 1);
}

// CHECK-LABEL: @test_vfmlalq_lane_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_847:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_847:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_8475:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84715:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84725:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84735:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84745:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84755:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84765:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
// CHECK-NEXT:    [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
// CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
// CHECK-NEXT:    [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
// CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
// CHECK-NEXT:    [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
// CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
// CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
// CHECK-NEXT:    [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2
// CHECK-NEXT:    store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
// CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT:    ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
  return vfmlalq_lane_low_f16(a, b, c, 2);
}

// CHECK-LABEL: @test_vfmlalq_lane_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_847:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_847:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_8475:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84715:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84725:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84735:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84745:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84755:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
// CHECK-NEXT:    [[__REINT1_84765:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
// CHECK-NEXT:    [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
// CHECK-NEXT:    [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
// CHECK-NEXT:    [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
// CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
// CHECK-NEXT:    [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
// CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
// CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
// CHECK-NEXT:    [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
// CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
// CHECK-NEXT:    store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
// CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
// CHECK-NEXT:    [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3
// CHECK-NEXT:    store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
// CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT:    ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
  return vfmlalq_lane_high_f16(a, b, c, 3);
}

// NOTE(review): the _laneq tests below take the lane from a 128-bit
// <8 x half> vector (vgetq_lane on <8 x i16>) instead of a 64-bit one.

// CHECK-LABEL: @test_vfmlal_laneq_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_850:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_850:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_8505:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85015:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85025:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4
// CHECK-NEXT:    store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT:    [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT:    ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
  return vfmlal_laneq_low_f16(a, b, c, 4);
}

// CHECK-LABEL: @test_vfmlal_laneq_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_850:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_850:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_8505:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85015:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85025:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5
// CHECK-NEXT:    store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT:    [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT:    ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
  return vfmlal_laneq_high_f16(a, b, c, 5);
}

// CHECK-LABEL: @test_vfmlalq_laneq_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_850:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_850:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_8505:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85015:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85025:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85035:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85045:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85055:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85065:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
// CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
// CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
// CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
// CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
// CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
497 // CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 498 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 499 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 500 // CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 501 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6 502 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 503 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 504 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 505 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> 506 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> 507 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> 508 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) 509 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] 510 // 511 float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { 512 return vfmlalq_laneq_low_f16(a, b, c, 6); 513 } 514 515 // CHECK-LABEL: @test_vfmlalq_laneq_high_f16( 516 // CHECK-NEXT: entry: 517 // CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 518 // CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 519 // CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 520 // CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 521 // CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 522 // CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 523 // CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 524 // CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 525 // CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 526 // CHECK-NEXT: 
[[__REINT1_85035:%.*]] = alloca i16, align 2 527 // CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 528 // CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 529 // CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 530 // CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 531 // CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 532 // CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 533 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 534 // CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 535 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 536 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 537 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 538 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 539 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 540 // CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 541 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 542 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 543 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 544 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 545 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 546 // CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 547 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 548 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 549 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 550 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 551 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 552 // 
CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 553 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 554 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 555 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 556 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 557 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 558 // CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 559 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 560 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 561 // CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 562 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 563 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 564 // CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 565 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7 566 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 567 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 568 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 569 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 570 // CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 571 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7 572 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 573 // CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 574 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 575 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 576 // 
CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 577 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7 578 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 579 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 580 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 581 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> 582 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> 583 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> 584 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) 585 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] 586 // 587 float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { 588 return vfmlalq_laneq_high_f16(a, b, c, 7); 589 } 590 591 // CHECK-LABEL: @test_vfmlsl_lane_low_f16( 592 // CHECK-NEXT: entry: 593 // CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 594 // CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 595 // CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 596 // CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 597 // CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 598 // CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 599 // CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 600 // CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 601 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 602 // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 603 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 604 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 605 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr 
[[__REINT1_847]], align 2 606 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 607 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 608 // CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 609 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 610 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 611 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 612 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 613 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 614 // CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 615 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 616 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 617 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 618 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 619 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 620 // CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 621 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 622 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 623 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 624 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 625 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> 626 // CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> 627 // CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> 628 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) 629 // CHECK-NEXT: ret 
<2 x float> [[VFMLSL_LOW3_I]] 630 // 631 float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { 632 return vfmlsl_lane_low_f16(a, b, c, 0); 633 } 634 635 // CHECK-LABEL: @test_vfmlsl_lane_high_f16( 636 // CHECK-NEXT: entry: 637 // CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 638 // CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 639 // CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 640 // CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 641 // CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 642 // CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 643 // CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 644 // CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 645 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 646 // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 647 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 648 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 649 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 650 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 651 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 652 // CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 653 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 654 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 655 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 656 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 657 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 658 // CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 659 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 660 // CHECK-NEXT: 
store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 661 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 662 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 663 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 664 // CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 665 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 666 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 667 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 668 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 669 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> 670 // CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> 671 // CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> 672 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) 673 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] 674 // 675 float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { 676 return vfmlsl_lane_high_f16(a, b, c, 1); 677 } 678 679 // CHECK-LABEL: @test_vfmlslq_lane_low_f16( 680 // CHECK-NEXT: entry: 681 // CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 682 // CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 683 // CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 684 // CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 685 // CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 686 // CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 687 // CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 688 // CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 689 // CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 
690 // CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 691 // CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 692 // CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 693 // CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 694 // CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 695 // CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 696 // CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 697 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 698 // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 699 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 700 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 701 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 702 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 703 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 704 // CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 705 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 706 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 707 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 708 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 709 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 710 // CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 711 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 712 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 713 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 714 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 715 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 716 // 
CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 717 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 718 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 719 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 720 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 721 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 722 // CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 723 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 724 // CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 725 // CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 726 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 727 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 728 // CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 729 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2 730 // CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 731 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 732 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 733 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 734 // CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 735 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2 736 // CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 737 // CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 738 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 739 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 740 // CHECK-NEXT: 
[[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 741 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2 742 // CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 743 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 744 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 745 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> 746 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> 747 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> 748 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) 749 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] 750 // 751 float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) { 752 return vfmlslq_lane_low_f16(a, b, c, 2); 753 } 754 755 // CHECK-LABEL: @test_vfmlslq_lane_high_f16( 756 // CHECK-NEXT: entry: 757 // CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 758 // CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 759 // CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 760 // CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 761 // CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 762 // CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 763 // CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 764 // CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 765 // CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 766 // CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 767 // CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 768 // CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 769 // CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 770 // CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca 
i16, align 2 771 // CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 772 // CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 773 // CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 774 // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 775 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 776 // CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 777 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 778 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 779 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 780 // CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 781 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 782 // CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 783 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 784 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 785 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 786 // CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 787 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 788 // CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 789 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 790 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 791 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 792 // CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 793 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 794 // CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 795 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 796 // 
CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 797 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 798 // CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 799 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 800 // CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 801 // CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 802 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 803 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 804 // CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 805 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 806 // CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 807 // CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 808 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 809 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 810 // CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 811 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 812 // CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 813 // CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 814 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 815 // CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 816 // CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 817 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3 818 // CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 819 // CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 820 // CHECK-NEXT: 
[[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 821 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> 822 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> 823 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> 824 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) 825 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] 826 // 827 float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) { 828 return vfmlslq_lane_high_f16(a, b, c, 3); 829 } 830 831 // CHECK-LABEL: @test_vfmlsl_laneq_low_f16( 832 // CHECK-NEXT: entry: 833 // CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 834 // CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 835 // CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 836 // CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 837 // CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 838 // CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 839 // CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 840 // CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 841 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 842 // CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 843 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 844 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 845 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 846 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 847 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 848 // CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 849 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> 
[[TMP2]], i32 4 850 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 851 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 852 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 853 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 854 // CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 855 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 856 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 857 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 858 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 859 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 860 // CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 861 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 862 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 863 // CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 864 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 865 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> 866 // CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> 867 // CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> 868 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) 869 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] 870 // 871 float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) { 872 return vfmlsl_laneq_low_f16(a, b, c, 4); 873 } 874 875 // CHECK-LABEL: @test_vfmlsl_laneq_high_f16( 876 // CHECK-NEXT: entry: 877 // CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 
878 // CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 879 // CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 880 // CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 881 // CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 882 // CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 883 // CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 884 // CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 885 // CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 886 // CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 887 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 888 // CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 889 // CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 890 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 891 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 892 // CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 893 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 894 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 895 // CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 896 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 897 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 898 // CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 899 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 900 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 901 // CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 902 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 903 // CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 
16
// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT:    ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
  return vfmlsl_laneq_high_f16(a, b, c, 5);
}

// vfmlslq_laneq_low_f16: lane 6 of the 128-bit vector c is broadcast to all 8
// half lanes (via the per-lane i16 store/load reinterpret sequence below),
// then the low-half fmlsl intrinsic is called on the splatted operand.
// CHECK-LABEL: @test_vfmlslq_laneq_low_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_850:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_850:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_8505:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85015:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85025:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85035:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85045:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85055:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85065:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
// CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
// CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
// CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
// CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
// CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
// CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
// CHECK-NEXT:    [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6
// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
// CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT:    [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT:    ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlslq_laneq_low_f16(a, b, c, 6);
}

// vfmlslq_laneq_high_f16: same splat pattern as above, but lane 7 of c is
// broadcast and the high-half (fmlsl2) intrinsic is called.
// CHECK-LABEL: @test_vfmlslq_laneq_high_f16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[__REINT_850:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_850:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_8505:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85015:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85025:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85035:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85045:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85055:%.*]] = alloca i16, align 2
// CHECK-NEXT:    [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT:    [[__REINT1_85065:%.*]] = alloca i16, align 2
// CHECK-NEXT:    store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
// CHECK-NEXT:    [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
// CHECK-NEXT:    [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
// CHECK-NEXT:    [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
// CHECK-NEXT:    [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
// CHECK-NEXT:    [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
// CHECK-NEXT:    [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
// CHECK-NEXT:    [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
// CHECK-NEXT:    [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
// CHECK-NEXT:    [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
// CHECK-NEXT:    [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
// CHECK-NEXT:    [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
// CHECK-NEXT:    [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
// CHECK-NEXT:    store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
// CHECK-NEXT:    [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
// CHECK-NEXT:    [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7
// CHECK-NEXT:    store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
// CHECK-NEXT:    [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
// CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT:    [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT:    ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlslq_laneq_high_f16(a, b, c, 7);
}