// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s

// REQUIRES: aarch64-registered-target || arm-registered-target

#include <arm_neon.h>

// CHECK-LABEL: @test_vmla_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmla_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlaq_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmla_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmla_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlaq_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmla_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmla_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlaq_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmla_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmls_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmls_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlsq_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmls_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmls_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsq_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlsq_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmls_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmls_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsq_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlsq_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmul_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
  return vmul_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
  return vmulq_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vmul_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
  return vmul_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
  return vmulq_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmul_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
  return vmul_lane_u16(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
  return vmulq_lane_u16(a, v, 3);
}

// CHECK-LABEL: @test_vmul_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
  return vmul_lane_u32(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
  return vmulq_lane_u32(a, v, 1);
}

// CHECK-LABEL: @test_vmul_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
  return vmul_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmulq_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vmulq_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmul_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
  return vmul_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vmulq_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmul_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
  return vmul_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmulq_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
  return vmulq_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmul_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
  return vmul_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT:    ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
  return vmulq_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vfma_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfma_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmaq_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmaq_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfma_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT:    ret <2 x float> [[TMP6]]
//
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmaq_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT:    ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfms_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT:    ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfms_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmsq_lane_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT:    ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmsq_lane_f32(a, b, v, 1);
}

// CHECK-LABEL: @test_vfms_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT:    ret <2 x float> [[TMP6]]
//
float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmsq_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT:    ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmaq_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
  return vfmaq_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT:    ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmaq_laneq_f64(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmsq_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT:    ret <2 x double> [[FMLA2]]
//
float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
  return vfmsq_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT:    ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmsq_laneq_f64(a, b, v, 1);
}

// CHECK-LABEL: @test_vfmas_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT:    ret float [[TMP0]]
//
float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
  return vfmas_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmsd_lane_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT:    ret double [[TMP0]]
//
float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
  return vfmsd_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmss_laneq_f32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[B:%.*]]
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
// CHECK-NEXT:    ret float [[TMP0]]
//
float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
  return vfmss_laneq_f32(a, b, v, 3);
}

// CHECK-LABEL: @test_vfmsd_laneq_f64(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
// CHECK-NEXT:    ret double [[TMP0]]
//
float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
  return vfmsd_laneq_f64(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlal_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_high_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_high_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_high_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlal_high_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsl_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsl_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_high_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_high_lane_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_s32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlal_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_high_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlal_high_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsl_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsl_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u32(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT:    ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmull_lane_s16(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT:    [[VMULL2_I:%.*]] = call <4
x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) 1153 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] 1154 // 1155 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { 1156 return vmull_lane_s16(a, v, 3); 1157 } 1158 1159 // CHECK-LABEL: @test_vmull_lane_s32( 1160 // CHECK-NEXT: entry: 1161 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> 1162 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1163 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1> 1164 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> 1165 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 1166 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) 1167 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] 1168 // 1169 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { 1170 return vmull_lane_s32(a, v, 1); 1171 } 1172 1173 // CHECK-LABEL: @test_vmull_lane_u16( 1174 // CHECK-NEXT: entry: 1175 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> 1176 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1177 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1178 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> 1179 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> 1180 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) 1181 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] 1182 // 1183 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { 1184 return vmull_lane_u16(a, v, 3); 1185 } 1186 1187 // CHECK-LABEL: @test_vmull_lane_u32( 1188 // CHECK-NEXT: entry: 1189 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> 1190 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1191 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1> 1192 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> 1193 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 1194 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) 1195 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] 1196 // 1197 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { 1198 return vmull_lane_u32(a, v, 1); 1199 } 1200 1201 // CHECK-LABEL: @test_vmull_high_lane_s16( 1202 // CHECK-NEXT: entry: 1203 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1204 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> 1205 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1206 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1207 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1208 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> 1209 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) 1210 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] 1211 // 1212 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { 1213 return vmull_high_lane_s16(a, v, 3); 1214 } 1215 1216 // CHECK-LABEL: 
@test_vmull_high_lane_s32( 1217 // CHECK-NEXT: entry: 1218 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3> 1219 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> 1220 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1221 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1> 1222 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1223 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 1224 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) 1225 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] 1226 // 1227 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { 1228 return vmull_high_lane_s32(a, v, 1); 1229 } 1230 1231 // CHECK-LABEL: @test_vmull_high_lane_u16( 1232 // CHECK-NEXT: entry: 1233 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1234 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> 1235 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 1236 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1237 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 1238 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> 1239 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) 1240 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] 1241 // 1242 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { 1243 return vmull_high_lane_u16(a, v, 3); 1244 } 1245 1246 // CHECK-LABEL: @test_vmull_high_lane_u32( 1247 // CHECK-NEXT: entry: 1248 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3> 1249 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> 1250 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 1251 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1> 1252 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 1253 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 1254 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) 1255 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] 1256 // 1257 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { 1258 return vmull_high_lane_u32(a, v, 1); 1259 } 1260 1261 // CHECK-LABEL: @test_vmull_laneq_s16( 1262 // CHECK-NEXT: entry: 1263 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> 1264 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 1265 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7> 1266 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> 1267 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> 1268 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) 1269 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] 1270 // 1271 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { 1272 return vmull_laneq_s16(a, v, 7); 1273 } 1274 1275 
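// Note: the vmull_laneq_* and vmull_high_laneq_* tests that follow have the
// same shape as the vmull_lane_* tests above; the only difference is that the
// lane source is a 128-bit vector, so the splat shufflevector reads from an
// <8 x i16> or <4 x i32> input and the lane immediate may address the upper
// half (lane 7 or lane 3). As a rough model (illustrative only; the
// vdup_laneq_* spelling is not part of the checked output),
// vmull_laneq_u16(a, v, 7) behaves like vmull_u16(a, vdup_laneq_u16(v, 7)).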
// CHECK-LABEL: @test_vmull_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
  return vmull_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmull_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
  return vmull_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
  return vmull_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vmull_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  return vmull_high_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  return vmull_high_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vmull_high_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
  return vmull_high_laneq_u16(a, v, 7);
}

// CHECK-LABEL: @test_vmull_high_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
  return vmull_high_laneq_u32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmlal_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlal_lane_s16(a, b, v, 3);
}

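// Note: as the checks for vqdmlal_lane_s16 above and the vqdmlal/vqdmlsl
// variants below show, the lane forms of the saturating doubling
// multiply-accumulate intrinsics are emitted as two IR intrinsics rather
// than one: the splatted lane feeds @llvm.aarch64.neon.sqdmull.*, and the
// result is then accumulated with the saturating @llvm.aarch64.neon.sqadd.*
// (or sqsub.* for the vqdmlsl forms). As a rough model (illustrative only,
// not checked here), vqdmlal_lane_s32(a, b, v, 1) behaves like
// vqaddq_s64(a, vqdmull_lane_s32(b, v, 1)).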
// CHECK-LABEL: @test_vqdmlal_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlal_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlal_high_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlal_high_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlsl_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlsl_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlsl_high_lane_s16(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlsl_high_lane_s32(a, b, v, 1);
}

// CHECK-LABEL: @test_vqdmull_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
  return vqdmull_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
  return vqdmull_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqdmull_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqdmull_laneq_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmull_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
  return vqdmull_high_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
  return vqdmull_high_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmull_high_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmull_high_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  return vqdmulh_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  return vqdmulhq_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulh_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  return vqdmulh_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  return vqdmulhq_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
  return vqrdmulh_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
  return vqrdmulhq_lane_s16(a, v, 3);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
  return vqrdmulh_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
  return vqrdmulhq_lane_s32(a, v, 1);
}

// CHECK-LABEL: @test_vmul_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
  return vmul_lane_f32(a, v, 1);
}


// CHECK-LABEL: @test_vmul_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
  return vmul_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
  return vmulq_lane_f32(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
  return vmulq_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
  return vmul_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmul_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
  return vmul_laneq_f64(a, v, 1);
}

// CHECK-LABEL: @test_vmulq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
  return vmulq_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmulq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
  return vmulq_laneq_f64(a, v, 1);
}

// CHECK-LABEL: @test_vmulx_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
  return vmulx_lane_f32(a, v, 1);
}

// CHECK-LABEL: @test_vmulxq_lane_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
  return vmulxq_lane_f32(a, v, 1);
}

// CHECK-LABEL: @test_vmulxq_lane_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
  return vmulxq_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulx_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
  return vmulx_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmulxq_laneq_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
  return vmulxq_laneq_f32(a, v, 3);
}

// CHECK-LABEL: @test_vmulxq_laneq_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
  return vmulxq_laneq_f64(a, v, 1);
}

// CHECK-LABEL: @test_vmla_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmla_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlaq_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmla_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
  return vmlaq_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vmla_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vmlaq_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vmla_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vmlaq_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
  return vmls_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
  return vmlsq_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
  return vmls_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { 2077 return vmlsq_lane_s32(a, b, v, 0); 2078 } 2079 2080 // CHECK-LABEL: @test_vmls_laneq_s16_0( 2081 // CHECK-NEXT: entry: 2082 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> 2083 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 2084 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer 2085 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] 2086 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] 2087 // CHECK-NEXT: ret <4 x i16> [[SUB]] 2088 // 2089 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { 2090 return vmls_laneq_s16(a, b, v, 0); 2091 } 2092 2093 // CHECK-LABEL: @test_vmlsq_laneq_s16_0( 2094 // CHECK-NEXT: entry: 2095 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> 2096 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 2097 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer 2098 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] 2099 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] 2100 // CHECK-NEXT: ret <8 x i16> [[SUB]] 2101 // 2102 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { 2103 return vmlsq_laneq_s16(a, b, v, 0); 2104 } 2105 2106 // CHECK-LABEL: @test_vmls_laneq_s32_0( 2107 // CHECK-NEXT: entry: 2108 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> 2109 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2110 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer 2111 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] 2112 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] 2113 // CHECK-NEXT: ret <2 x i32> [[SUB]] 2114 // 2115 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { 2116 return vmls_laneq_s32(a, b, v, 0); 2117 } 2118 2119 // CHECK-LABEL: @test_vmlsq_laneq_s32_0( 2120 // CHECK-NEXT: entry: 2121 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> 2122 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 2123 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer 2124 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] 2125 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] 2126 // CHECK-NEXT: ret <4 x i32> [[SUB]] 2127 // 2128 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { 2129 return vmlsq_laneq_s32(a, b, v, 0); 2130 } 2131 2132 // CHECK-LABEL: @test_vmul_lane_s16_0( 2133 // CHECK-NEXT: entry: 2134 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> 2135 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2136 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer 2137 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] 2138 // CHECK-NEXT: ret <4 x i16> [[MUL]] 2139 // 2140 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { 2141 return vmul_lane_s16(a, v, 0); 2142 } 2143 2144 // CHECK-LABEL: @test_vmulq_lane_s16_0( 2145 // CHECK-NEXT: entry: 2146 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> 2147 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 2148 // CHECK-NEXT: 
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vmulq_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vmul_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vmulq_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
  return vmul_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
  return vmulq_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
  return vmul_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
  return vmulq_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vmul_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vmulq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vmul_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vmulq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
  return vmul_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
  return vmulq_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
  return vmul_laneq_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
  return vmulq_laneq_u32(a, v, 0);
}

// CHECK-LABEL: @test_vfma_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfma_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmaq_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfma_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT: ret <2 x float> [[TMP6]]
//
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfms_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
  return vfms_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
  return vfmsq_lane_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfms_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
// CHECK-NEXT: ret <2 x float> [[TMP6]]
//
float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
// CHECK-NEXT: ret <4 x float> [[TMP6]]
//
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT: ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmaq_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
// CHECK-NEXT: ret <2 x double> [[TMP6]]
//
float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmsq_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlal_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlal_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlal_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlal_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlal_high_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlal_high_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlal_high_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlal_high_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vmlsl_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vmlsl_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vmlsl_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vmlsl_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vmlsl_high_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vmlsl_high_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vmlsl_high_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vmlsl_high_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vmull_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vmull_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
  return vmull_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
  return vmull_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vmull_high_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vmull_high_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
  return vmull_high_lane_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
  return vmull_high_lane_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vmull_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vmull_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
  return vmull_laneq_u16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
  return vmull_laneq_u32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vmull_high_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vmull_high_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vmull_high_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> 3190 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> 3191 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> 3192 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer 3193 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> 3194 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> 3195 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) 3196 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] 3197 // 3198 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { 3199 return vmull_high_laneq_u16(a, v, 0); 3200 } 3201 3202 // CHECK-LABEL: @test_vmull_high_laneq_u32_0( 3203 // CHECK-NEXT: entry: 3204 // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3> 3205 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> 3206 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 3207 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer 3208 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 3209 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 3210 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) 3211 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] 3212 // 3213 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { 3214 return vmull_high_laneq_u32(a, v, 0); 3215 } 3216 3217 // CHECK-LABEL: @test_vqdmlal_lane_s16_0( 3218 // CHECK-NEXT: entry: 3219 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> 3220 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> 3221 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer 3222 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> 3223 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> 3224 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> 3225 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) 3226 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) 3227 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] 3228 // 3229 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { 3230 return vqdmlal_lane_s16(a, b, v, 0); 3231 } 3232 3233 // CHECK-LABEL: @test_vqdmlal_lane_s32_0( 3234 // CHECK-NEXT: entry: 3235 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> 3236 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> 3237 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer 3238 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> 3239 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> 3240 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> 3241 // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) 3242 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> 
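// NOTE: In the _high_lane/_high_laneq tests above, the upper half of the
// 128-bit input is extracted with a shufflevector (<i32 4, i32 5, i32 6,
// i32 7> for i16 elements, <i32 2, i32 3> for i32) before the widening
// multiply; selecting lane 0 shows up as a splat shuffle whose mask is
// zeroinitializer.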

// CHECK-LABEL: @test_vqdmlal_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlal_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlal_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlal_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlal_high_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
  return vqdmlsl_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
  return vqdmlsl_lane_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
  return vqdmlsl_high_lane_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
  return vqdmlsl_high_lane_s32(a, b, v, 0);
}
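// NOTE: There is no single IR intrinsic for the saturating doubling
// multiply-accumulate: the checks above show vqdmlal lowering to
// @llvm.aarch64.neon.sqdmull followed by @llvm.aarch64.neon.sqadd, and
// vqdmlsl to the same sqdmull followed by @llvm.aarch64.neon.sqsub.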

// CHECK-LABEL: @test_vqdmull_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vqdmull_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vqdmull_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmull_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmull_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vqdmull_high_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vqdmull_high_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmull_high_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmull_high_laneq_s32(a, v, 0);
}
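// NOTE: For vqdmull the sqdmull call itself is the returned value; the
// trailing [[VQDMULL_V3_I]] bitcast to <16 x i8> comes from the generic
// builtin lowering and is simply unused here.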

// CHECK-LABEL: @test_vqdmulh_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
//
int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vqdmulh_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
//
int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vqdmulhq_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
//
int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vqdmulh_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
//
int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vqdmulhq_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
//
int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
  return vqrdmulh_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
//
int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
  return vqrdmulhq_lane_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
//
int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
  return vqrdmulh_lane_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
//
int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
  return vqrdmulhq_lane_s32(a, v, 0);
}
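// NOTE: vqdmulh/vqrdmulh are different: the lane number survives as an i32
// immediate on the @llvm.aarch64.neon.sqdmulh.lane / .sqrdmulh.lane
// intrinsics instead of being materialized as a broadcast shuffle.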

// CHECK-LABEL: @test_vmul_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
  return vmul_lane_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
  return vmulq_lane_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
  return vmul_laneq_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmul_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
// CHECK-NEXT: ret <1 x double> [[TMP5]]
//
float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
  return vmul_laneq_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
  return vmulq_laneq_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
  return vmulq_laneq_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulx_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
  return vmulx_lane_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
  return vmulxq_lane_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_lane_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
  return vmulxq_lane_f64(a, v, 0);
}

// CHECK-LABEL: @test_vmulx_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
  return vmulx_laneq_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
  return vmulxq_laneq_f32(a, v, 0);
}

// CHECK-LABEL: @test_vmulxq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
  return vmulxq_laneq_f64(a, v, 0);
}
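// NOTE: Plain vmul on float vectors folds to an ordinary IR fmul against
// the broadcast lane, whereas vmulx must call @llvm.aarch64.neon.fmulx:
// FMULX differs from FMUL only for (0.0, infinity) operand pairs, where it
// returns +/-2.0 instead of NaN.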

// CHECK-LABEL: @test_vmull_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
  return vmull_high_n_s16(a, b);
}

// CHECK-LABEL: @test_vmull_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
  return vmull_high_n_s32(a, b);
}

// CHECK-LABEL: @test_vmull_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
  return vmull_high_n_u16(a, b);
}

// CHECK-LABEL: @test_vmull_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
  return vmull_high_n_u32(a, b);
}

// CHECK-LABEL: @test_vqdmull_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
  return vqdmull_high_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmull_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
  return vqdmull_high_n_s32(a, b);
}

// CHECK-LABEL: @test_vmlal_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vmlal_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vmlal_high_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlal_high_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlal_high_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlal_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlal_high_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vmlsl_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vmlsl_high_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlsl_high_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_high_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlsl_high_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_high_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlsl_high_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_high_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlsl_high_n_s32(a, b, c);
}
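// NOTE: The _n_ variants splat the scalar operand through a chain of
// insertelement instructions starting from poison (the [[VECINIT*_I]]
// values) and then reuse the vector lowering, so vmull_high_n_s16(a, b)
// produces essentially the same IR as
// vmull_s16(vget_high_s16(a), vdup_n_s16(b)).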

// CHECK-LABEL: @test_vmul_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x float> [[MUL_I]]
//
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x float> [[MUL_I]]
//
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x double> [[MUL_I]]
//
float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
  return vmulq_n_f64(a, b);
}

// CHECK-LABEL: @test_vfma_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfma_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vfma_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
  return vfma_n_f64(a, b, n);
}

// CHECK-LABEL: @test_vfmaq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmaq_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vfms_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfms_n_f32(a, b, n);
}

// CHECK-LABEL: @test_vfms_n_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
  return vfms_n_f64(a, b, n);
}

// CHECK-LABEL: @test_vfmsq_n_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmsq_n_f32(a, b, n);
}
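// NOTE: vfma_n maps onto the target-independent @llvm.fma intrinsic with
// the splat as the second multiplicand; vfms_n has no separate intrinsic
// and is formed by negating b with an fneg before the same @llvm.fma call.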
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

// CHECK-LABEL: @test_vmul_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

// CHECK-LABEL: @test_vmul_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
//
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

// CHECK-LABEL: @test_vmulq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
//
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

// CHECK-LABEL: @test_vmul_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
//
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

// CHECK-LABEL: @test_vmulq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
//
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}

// CHECK-LABEL: @test_vmull_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}

// CHECK-LABEL: @test_vmull_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}

// CHECK-LABEL: @test_vmull_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}

// CHECK-LABEL: @test_vmull_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}

// CHECK-LABEL: @test_vqdmull_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmull_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}

// CHECK-LABEL: @test_vqdmulh_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}

// CHECK-LABEL: @test_vqdmulh_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}

// CHECK-LABEL: @test_vqdmulhq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}

// CHECK-LABEL: @test_vqrdmulh_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}

// CHECK-LABEL: @test_vqrdmulh_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}

// CHECK-LABEL: @test_vqrdmulhq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}

// CHECK-LABEL: @test_vmla_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmla_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmla_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
//
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmla_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
//
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmla_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
//
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmla_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlaq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlal_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlal_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlal_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlal_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlal_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlal_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
//
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
//
int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlsq_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
//
int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmls_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlsq_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
//
uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmls_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
//
uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlsq_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmls_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
//
uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmls_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlsq_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlsq_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

// CHECK-LABEL: @test_vmlsl_n_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_n_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}

// CHECK-LABEL: @test_vqdmlsl_n_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}

// CHECK-LABEL: @test_vmla_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmla_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlaq_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_lane_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_laneq_u16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 0);
}

// CHECK-LABEL: @test_vmls_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vmlsq_laneq_u32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 0);
}

// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 0);
}
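
// From here on the same intrinsics are exercised with the highest valid
// lane index rather than lane 0. The unsigned multiply-accumulate forms
// emit the same mul/add IR as the signed ones, since wrapping
// two's-complement arithmetic is sign-agnostic; per lane,
// vmla_lane_u16(a, b, v, 3) computes a[i] + b[i] * v[3].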
// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmla_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlaq_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmla_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlaq_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmla_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
//
uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmla_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlaq_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
//
uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlaq_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmla_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
//
uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmla_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlaq_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlaq_laneq_u32(a, b, v, 3);
}
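
// vqdmlal_laneq is the additive counterpart of vqdmlsl_laneq: the splatted
// lane feeds @llvm.aarch64.neon.sqdmull and the widened product is combined
// with the accumulator through @llvm.aarch64.neon.sqadd. The _high variants
// first extract the upper half of b with a shufflevector.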
// CHECK-LABEL: @test_vqdmlal_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlal_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vqdmlal_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlal_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlal_high_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlal_high_laneq_s32(a, b, v, 3);
}
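
// The unsigned vmls tests mirror the vmla tests above, with sub in place
// of add: per lane, a[i] - b[i] * v[lane], wrapping modulo the element width.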
// CHECK-LABEL: @test_vmls_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
  return vmls_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsq_lane_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
  return vmlsq_lane_u16(a, b, v, 3);
}

// CHECK-LABEL: @test_vmls_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
  return vmls_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmlsq_lane_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
  return vmlsq_lane_u32(a, b, v, 1);
}

// CHECK-LABEL: @test_vmls_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
//
uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
  return vmls_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmlsq_laneq_u16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
//
uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
  return vmlsq_laneq_u16(a, b, v, 7);
}

// CHECK-LABEL: @test_vmls_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
//
uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
  return vmls_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vmlsq_laneq_u32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
  return vmlsq_laneq_u32(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
  return vqdmlsl_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
  return vqdmlsl_laneq_s32(a, b, v, 3);
}

// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
  return vqdmlsl_high_laneq_s16(a, b, v, 7);
}

// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
  return vqdmlsl_high_laneq_s32(a, b, v, 3);
}
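
// Maximum-lane versions of the earlier vqdmulh/vqrdmulh tests: the lane
// immediate is 7 for the eight i16 lanes and 3 for the four i32 lanes of
// the 128-bit v operand.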
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqdmulh_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqdmulhq_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqdmulh_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqdmulhq_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
  return vqrdmulh_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
  return vqrdmulhq_laneq_s16(a, v, 7);
}

// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
  return vqrdmulh_laneq_s32(a, v, 3);
}

// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
  return vqrdmulhq_laneq_s32(a, v, 3);
}