; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+aes | FileCheck %s

; Widening signed multiply: each llvm.aarch64.neon.smull call on two 64-bit
; vectors loaded from memory is expected to become a single smull.

define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i16>  @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; Widening unsigned multiply: same shape as the smull tests, expecting umull.

define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

declare <8 x i16>  @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; Saturating doubling widening multiply (sqdmull). The "2_" variants take the
; upper half of a 128-bit load via shufflevector; the expected code loads that
; half directly as a d register at byte offset 8.

define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmull4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmull2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmull2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sqdmull.4s v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmull2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0, #8]
; CHECK-NEXT:    ldr d1, [x1, #8]
; CHECK-NEXT:    sqdmull.2d v0, v0, v1
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}


declare <4 x i32>  @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64>  @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

; Polynomial widening multiply on 8 x i8 inputs.

define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: pmull8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    pmull.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = load <8 x i8>, ptr %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

; Saturating doubling multiply returning the high half (sqdmulh), for 64-bit
; and 128-bit vectors plus the scalar i32 form. The scalar case is expected to
; move both operands from w registers into s registers and the result back.

define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmulh.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqdmulh.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqdmulh.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqdmulh.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i32>, ptr %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_1s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    fmov s1, w9
; CHECK-NEXT:    sqdmulh s0, s0, s1
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %tmp1 = load i32, ptr %A
  %tmp2 = load i32, ptr %B
  %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
  ret i32 %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone

; Rounding variant of the above (sqrdmulh), covering the same five shapes.

define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqrdmulh.4h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqrdmulh.8h v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = load <8 x i16>, ptr %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    sqrdmulh.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    sqrdmulh.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = load <4 x i32>, ptr %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrdmulh_1s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    fmov s1, w9
; CHECK-NEXT:    sqrdmulh s0, s0, s1
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %tmp1 = load i32, ptr %A
  %tmp2 = load i32, ptr %B
  %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
  ret i32 %tmp3
}

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone

; Floating-point fmulx on 2 x float, 4 x float and 2 x double vectors.

define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: fmulx_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    fmulx.2s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, ptr %A
  %tmp2 = load <2 x float>, ptr %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: fmulx_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fmulx.4s v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = load <4 x float>, ptr %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: fmulx_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    fmulx.2d v0, v0, v1
; CHECK-NEXT:    ret
  %tmp1 = load <2 x double>, ptr %A
  %tmp2 = load <2 x double>, ptr %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone

; Signed multiply-accumulate: an smull whose result feeds an add is expected
; to fold into smlal, accumulating into the register loaded from %C.

define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; Two chained accumulations seeded with a constant splat of 257. The expected
; code materialises the constant once (movi.16b #1 for the i16 case, mov+dup
; for the i64 case) and uses it as the accumulator of both smlal instructions.

define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-LABEL: smlal8h_chain_with_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.16b v3, #1
; CHECK-NEXT:    smlal.8h v3, v0, v2
; CHECK-NEXT:    mvn.8b v0, v2
; CHECK-NEXT:    smlal.8h v3, v1, v0
; CHECK-NEXT:    str q3, [x0]
; CHECK-NEXT:    ret
  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %add.1 = add <8 x i16> %smull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
  %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
  %add.2 = add <8 x i16> %add.1, %smull.2
  store <8 x i16> %add.2, ptr %dst
  ret void
}

define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-LABEL: smlal2d_chain_with_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #257 // =0x101
; CHECK-NEXT:    dup.2d v3, x8
; CHECK-NEXT:    smlal.2d v3, v0, v2
; CHECK-NEXT:    mvn.8b v0, v2
; CHECK-NEXT:    smlal.2d v3, v1, v0
; CHECK-NEXT:    str q3, [x0]
; CHECK-NEXT:    ret
  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
  %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
  %add.1 = add <2 x i64> %smull.1, <i64 257, i64 257>
  %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
  %add.2 = add <2 x i64> %add.1, %smull.2
  store <2 x i64> %add.2, ptr %dst
  ret void
}

; Signed multiply-subtract: an smull whose result is subtracted from an
; accumulator is expected to fold into smlsl.

define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = sub <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    smlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = sub <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-LABEL: smlsl8h_chain_with_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.16b v3, #1
; CHECK-NEXT:    smlsl.8h v3, v0, v2
; CHECK-NEXT:    mvn.8b v0, v2
; CHECK-NEXT:    smlsl.8h v3, v1, v0
; CHECK-NEXT:    str q3, [x0]
; CHECK-NEXT:    ret
  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %sub.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %smull.1
  %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
  %sub.2 = sub <8 x i16> %sub.1, %smull.2
  store <8 x i16> %sub.2, ptr %dst
  ret void
}

define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-LABEL: smlsl2d_chain_with_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #257 // =0x101
; CHECK-NEXT:    dup.2d v3, x8
; CHECK-NEXT:    smlsl.2d v3, v0, v2
; CHECK-NEXT:    mvn.8b v0, v2
; CHECK-NEXT:    smlsl.2d v3, v1, v0
; CHECK-NEXT:    str q3, [x0]
; CHECK-NEXT:    ret
  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
  %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
  %sub.1 = sub <2 x i64> <i64 257, i64 257>, %smull.1
  %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
  %sub.2 = sub <2 x i64> %sub.1, %smull.2
  store <2 x i64> %sub.2, ptr %dst
  ret void
}

declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)

; Saturating doubling multiply-accumulate: sqdmull feeding sqadd is expected
; to fold into sqdmlal. The "2_" variants use the upper half of 128-bit loads.

define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlal2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlal2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

; Saturating doubling multiply-subtract: sqdmull feeding sqsub is expected to
; fold into sqdmlsl.

define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlsl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlsl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlsl2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
  ret <4 x i32> %tmp5
}

define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: sqdmlsl2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    ldr d1, [x0, #8]
; CHECK-NEXT:    ldr d2, [x1, #8]
; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
; CHECK-NEXT:    ret
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
  ret <2 x i64> %tmp5
}

; Unsigned multiply-accumulate: umull feeding an add is expected to fold into
; umlal.

define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    umlal.4s v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = load <4 x i16>, ptr %B
  %tmp3 = load <4 x i32>, ptr %C
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d2, [x1]
; CHECK-NEXT:    ldr q0, [x2]
; CHECK-NEXT:    umlal.2d v0, v1, v2
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = load <2 x i32>, ptr %B
  %tmp3 = load <2 x i64>, ptr %C
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp5 = add <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
; CHECK-LABEL: umlal8h_chain_with_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi.16b v3, #1
; CHECK-NEXT:    umlal.8h v3, v0, v2
; CHECK-NEXT:    mvn.8b v0, v2
; CHECK-NEXT:    umlal.8h v3, v1, v0
; CHECK-NEXT:    str q3, [x0]
; CHECK-NEXT:    ret
  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
  %add.1 = add <8 x i16> %umull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
  %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
  %add.2 = add <8 x i16> %add.1, %umull.2
  store <8 x i16> %add.2, ptr %dst
  ret void
}
684 685define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { 686; CHECK-LABEL: umlal2d_chain_with_constant: 687; CHECK: // %bb.0: 688; CHECK-NEXT: mov w8, #257 // =0x101 689; CHECK-NEXT: dup.2d v3, x8 690; CHECK-NEXT: umlal.2d v3, v0, v2 691; CHECK-NEXT: mvn.8b v0, v2 692; CHECK-NEXT: umlal.2d v3, v1, v0 693; CHECK-NEXT: str q3, [x0] 694; CHECK-NEXT: ret 695 %xor = xor <2 x i32> %v3, <i32 -1, i32 -1> 696 %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3) 697 %add.1 = add <2 x i64> %umull.1, <i64 257, i64 257> 698 %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor) 699 %add.2 = add <2 x i64> %add.1, %umull.2 700 store <2 x i64> %add.2, ptr %dst 701 ret void 702} 703 704define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind { 705; CHECK-LABEL: umlsl4s: 706; CHECK: // %bb.0: 707; CHECK-NEXT: ldr d1, [x0] 708; CHECK-NEXT: ldr d2, [x1] 709; CHECK-NEXT: ldr q0, [x2] 710; CHECK-NEXT: umlsl.4s v0, v1, v2 711; CHECK-NEXT: ret 712 %tmp1 = load <4 x i16>, ptr %A 713 %tmp2 = load <4 x i16>, ptr %B 714 %tmp3 = load <4 x i32>, ptr %C 715 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 716 %tmp5 = sub <4 x i32> %tmp3, %tmp4 717 ret <4 x i32> %tmp5 718} 719 720define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind { 721; CHECK-LABEL: umlsl2d: 722; CHECK: // %bb.0: 723; CHECK-NEXT: ldr d1, [x0] 724; CHECK-NEXT: ldr d2, [x1] 725; CHECK-NEXT: ldr q0, [x2] 726; CHECK-NEXT: umlsl.2d v0, v1, v2 727; CHECK-NEXT: ret 728 %tmp1 = load <2 x i32>, ptr %A 729 %tmp2 = load <2 x i32>, ptr %B 730 %tmp3 = load <2 x i64>, ptr %C 731 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 732 %tmp5 = sub <2 x i64> %tmp3, %tmp4 733 ret <2 x i64> %tmp5 734} 735 736define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { 737; CHECK-LABEL: 
umlsl8h_chain_with_constant: 738; CHECK: // %bb.0: 739; CHECK-NEXT: movi.16b v3, #1 740; CHECK-NEXT: umlsl.8h v3, v0, v2 741; CHECK-NEXT: mvn.8b v0, v2 742; CHECK-NEXT: umlsl.8h v3, v1, v0 743; CHECK-NEXT: str q3, [x0] 744; CHECK-NEXT: ret 745 %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 746 %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3) 747 %add.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %umull.1 748 %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor) 749 %add.2 = sub <8 x i16> %add.1, %umull.2 750 store <8 x i16> %add.2, ptr %dst 751 ret void 752} 753 754define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { 755; CHECK-LABEL: umlsl2d_chain_with_constant: 756; CHECK: // %bb.0: 757; CHECK-NEXT: mov w8, #257 // =0x101 758; CHECK-NEXT: dup.2d v3, x8 759; CHECK-NEXT: umlsl.2d v3, v0, v2 760; CHECK-NEXT: mvn.8b v0, v2 761; CHECK-NEXT: umlsl.2d v3, v1, v0 762; CHECK-NEXT: str q3, [x0] 763; CHECK-NEXT: ret 764 %xor = xor <2 x i32> %v3, <i32 -1, i32 -1> 765 %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3) 766 %add.1 = sub <2 x i64> <i64 257, i64 257>, %umull.1 767 %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor) 768 %add.2 = sub <2 x i64> %add.1, %umull.2 769 store <2 x i64> %add.2, ptr %dst 770 ret void 771} 772 773define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind { 774; CHECK-LABEL: fmla_2s: 775; CHECK: // %bb.0: 776; CHECK-NEXT: ldr d1, [x0] 777; CHECK-NEXT: ldr d2, [x1] 778; CHECK-NEXT: ldr d0, [x2] 779; CHECK-NEXT: fmla.2s v0, v2, v1 780; CHECK-NEXT: ret 781 %tmp1 = load <2 x float>, ptr %A 782 %tmp2 = load <2 x float>, ptr %B 783 %tmp3 = load <2 x float>, ptr %C 784 %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3) 
785 ret <2 x float> %tmp4 786} 787 788define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind { 789; CHECK-LABEL: fmla_4s: 790; CHECK: // %bb.0: 791; CHECK-NEXT: ldr q1, [x0] 792; CHECK-NEXT: ldr q2, [x1] 793; CHECK-NEXT: ldr q0, [x2] 794; CHECK-NEXT: fmla.4s v0, v2, v1 795; CHECK-NEXT: ret 796 %tmp1 = load <4 x float>, ptr %A 797 %tmp2 = load <4 x float>, ptr %B 798 %tmp3 = load <4 x float>, ptr %C 799 %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3) 800 ret <4 x float> %tmp4 801} 802 803define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind { 804; CHECK-LABEL: fmla_2d: 805; CHECK: // %bb.0: 806; CHECK-NEXT: ldr q1, [x0] 807; CHECK-NEXT: ldr q2, [x1] 808; CHECK-NEXT: ldr q0, [x2] 809; CHECK-NEXT: fmla.2d v0, v2, v1 810; CHECK-NEXT: ret 811 %tmp1 = load <2 x double>, ptr %A 812 %tmp2 = load <2 x double>, ptr %B 813 %tmp3 = load <2 x double>, ptr %C 814 %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3) 815 ret <2 x double> %tmp4 816} 817 818declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone 819declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone 820declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone 821 822define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind { 823; CHECK-LABEL: fmls_2s: 824; CHECK: // %bb.0: 825; CHECK-NEXT: ldr d1, [x0] 826; CHECK-NEXT: ldr d2, [x1] 827; CHECK-NEXT: ldr d0, [x2] 828; CHECK-NEXT: fmls.2s v0, v1, v2 829; CHECK-NEXT: ret 830 %tmp1 = load <2 x float>, ptr %A 831 %tmp2 = load <2 x float>, ptr %B 832 %tmp3 = load <2 x float>, ptr %C 833 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2 834 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3) 835 ret <2 x float> %tmp5 836} 837 838define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind { 
839; CHECK-LABEL: fmls_4s: 840; CHECK: // %bb.0: 841; CHECK-NEXT: ldr q1, [x0] 842; CHECK-NEXT: ldr q2, [x1] 843; CHECK-NEXT: ldr q0, [x2] 844; CHECK-NEXT: fmls.4s v0, v1, v2 845; CHECK-NEXT: ret 846 %tmp1 = load <4 x float>, ptr %A 847 %tmp2 = load <4 x float>, ptr %B 848 %tmp3 = load <4 x float>, ptr %C 849 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2 850 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3) 851 ret <4 x float> %tmp5 852} 853 854define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind { 855; CHECK-LABEL: fmls_2d: 856; CHECK: // %bb.0: 857; CHECK-NEXT: ldr q1, [x0] 858; CHECK-NEXT: ldr q2, [x1] 859; CHECK-NEXT: ldr q0, [x2] 860; CHECK-NEXT: fmls.2d v0, v1, v2 861; CHECK-NEXT: ret 862 %tmp1 = load <2 x double>, ptr %A 863 %tmp2 = load <2 x double>, ptr %B 864 %tmp3 = load <2 x double>, ptr %C 865 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2 866 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3) 867 ret <2 x double> %tmp5 868} 869 870define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind { 871; CHECK-LABEL: fmls_commuted_neg_2s: 872; CHECK: // %bb.0: 873; CHECK-NEXT: ldr d1, [x0] 874; CHECK-NEXT: ldr d2, [x1] 875; CHECK-NEXT: ldr d0, [x2] 876; CHECK-NEXT: fmls.2s v0, v1, v2 877; CHECK-NEXT: ret 878 %tmp1 = load <2 x float>, ptr %A 879 %tmp2 = load <2 x float>, ptr %B 880 %tmp3 = load <2 x float>, ptr %C 881 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2 882 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3) 883 ret <2 x float> %tmp5 884} 885 886define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind { 887; CHECK-LABEL: fmls_commuted_neg_4s: 888; CHECK: // %bb.0: 889; CHECK-NEXT: ldr q1, [x0] 890; CHECK-NEXT: ldr q2, [x1] 891; CHECK-NEXT: ldr q0, [x2] 892; CHECK-NEXT: fmls.4s v0, v1, v2 893; CHECK-NEXT: ret 
%tmp1 = load <4 x float>, ptr %A
  %tmp2 = load <4 x float>, ptr %B
  %tmp3 = load <4 x float>, ptr %C
  %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
  %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
  ret <4 x float> %tmp5
}

; <2 x double> variant with the negated value as the first fma operand;
; a single fmls.2d is expected.
define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: fmls_commuted_neg_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: ldr q2, [x1]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: fmls.2d v0, v1, v2
; CHECK-NEXT: ret
  %tmp1 = load <2 x double>, ptr %A
  %tmp2 = load <2 x double>, ptr %B
  %tmp3 = load <2 x double>, ptr %C
  %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
  %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
  ret <2 x double> %tmp5
}

; fma(-%c, splat(%b lane 0), %a) on register arguments. The zeroinitializer
; shuffle mask broadcasts lane 0 of %b; the expected assembly is a by-element
; fmls.2s reading v1[0].
define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_2s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmls.2s v0, v2, v1[0]
; CHECK-NEXT: ret
entry:
  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
  ret <2 x float> %fmls1
}

; <4 x float> variant of fmls_indexed_2s; a by-element fmls.4s reading v1[0]
; is expected.
define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_4s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls.4s v0, v2, v1[0]
; CHECK-NEXT: ret
entry:
  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
%fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
  ret <4 x float> %fmls1
}

; <2 x double> variant: fma(-%c, splat(%b lane 0), %a); a by-element fmls.2d
; reading v1[0] is expected.
define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
; CHECK-LABEL: fmls_indexed_2d:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls.2d v0, v2, v1[0]
; CHECK-NEXT: ret
entry:
  %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
  ret <2 x double> %fmls1
}

; fma(splat(%c), %b, %a) where the splat is built with an insertelement chain
; from the scalar argument; a by-element fmla.2s reading v2[0] is expected.
; The fma must consume the fully populated %v2 (both lanes inserted), not the
; half-built %v1, otherwise the splat/by-element pattern is never exercised.
; NOTE(review): the expected assembly below mirrors
; fmla_indexed_scalar_2s_strict; regenerate with
; utils/update_llc_test_checks.py to confirm.
define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla.2s v0, v1, v2[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
  ret <2 x float> %fmla1
}

; <4 x float> variant: all four lanes are filled with %c before the fma;
; a by-element fmla.4s reading v2[0] is expected.
define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_4s:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla.4s v0, v1, v2[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <4 x float> undef, float %c, i32 0
  %v2 = insertelement <4 x float> %v1, float %c, i32 1
  %v3 = insertelement <4 x float> %v2, float %c, i32 2
  %v4 = insertelement <4 x float> %v3, float %c, i32 3
  %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %fmla1
}

; <2 x double> variant: both lanes are filled with %c before the fma;
; a by-element fmla.2d reading v2[0] is expected.
define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2d:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla.2d v0, v1, v2[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <2 x double> undef, double %c, i32 0
  %v2 = insertelement <2 x double> %v1, double %c, i32 1
  %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %fmla1
}

; Constrained-FP (strictfp) version of the indexed fmls pattern: fneg of %c,
; lane-0 splat of %b, and llvm.experimental.constrained.fma with
; round.tonearest / fpexcept.strict. A by-element fmls.2s is still expected.
define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmls_indexed_2s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmls.2s v0, v2, v1[0]
; CHECK-NEXT: ret
entry:
  %0 = fneg <2 x float> %c
  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x float> %fmls1
}

; <4 x float> strictfp variant; a by-element fmls.4s is expected.
define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmls_indexed_4s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls.4s v0, v2, v1[0]
; CHECK-NEXT: ret
entry:
  %0 = fneg <4 x float> %c
  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <4 x float> %fmls1
}

; <2 x double> strictfp variant; a by-element fmls.2d is expected.
define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmls_indexed_2d_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls.2d v0, v2, v1[0]
; CHECK-NEXT: ret
entry:
  %0 = fneg <2 x double> %c
  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x double> %fmls1
}

; strictfp version of the scalar-splat fmla pattern on <2 x float>: the fma
; consumes the fully populated %v2; a by-element fmla.2s is expected.
define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmla_indexed_scalar_2s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla.2s v0, v1, v2[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x float> %fmla1
}

; <4 x float> strictfp scalar-splat variant; a by-element fmla.4s is expected.
define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmla_indexed_scalar_4s_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: fmla.4s v0, v1, v2[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <4 x float> undef, float %c, i32 0
  %v2 = insertelement <4 x float> %v1, float %c, i32 1
  %v3 = insertelement <4 x float> %v2, float %c, i32 2
  %v4 = insertelement <4 x float> %v3, float %c, i32 3
  %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <4 x float> %fmla1
}

; <2 x double> strictfp scalar-splat variant; a by-element fmla.2d is expected.
define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp strictfp {
; CHECK-LABEL: fmla_indexed_scalar_2d_strict:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla.2d v0, v1, v2[0]
; CHECK-NEXT: ret
entry:
  %v1 = insertelement <2 x double> undef, double %c, i32 0
  %v2 = insertelement <2 x double> %v1, double %c, i32 1
  %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x double> %fmla1
}

; Attribute group attached to the constrained fma call sites above.
attributes #0 = { strictfp }

; Constrained fma intrinsics used by the *_strict tests.
declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)

; Integer multiply of %A by a splat of lane 1 of %B; a by-element mul.4h
; reading v1[1] is expected.
define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: mul_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul.4h v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <4 x i16> %A, %tmp3
  ret <4 x i16> %tmp4
}

; <8 x i16> variant; a by-element mul.8h reading v1[1] is expected.
define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: mul_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: mul.8h v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <8 x i16> %A, %tmp3
  ret <8 x i16> %tmp4
}

; <2 x i32> variant; a by-element mul.2s reading v1[1] is expected.
define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: mul_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul.2s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = mul <2 x i32> %A, %tmp3
  ret <2 x i32> %tmp4
}

; <4 x i32> variant; a by-element mul.4s reading v1[1] is expected.
define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: mul_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: mul.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = mul <4 x i32> %A, %tmp3
  ret <4 x i32> %tmp4
}

; Plain <2 x i64> multiply (no lane splat). The expected assembly moves both
; lanes of each operand to general-purpose registers, multiplies them with
; two scalar mul instructions, and reassembles the vector.
define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK-LABEL: mul_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov x10, d1
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov.d x8, v1[1]
; CHECK-NEXT: mov.d x9, v0[1]
; CHECK-NEXT: mul x10, x11, x10
; CHECK-NEXT: mul x8, x9, x8
; CHECK-NEXT: fmov d0, x10
; CHECK-NEXT: mov.d v0[1], x8
; CHECK-NEXT: ret
  %tmp1 = mul <2 x i64> %A, %B
  ret <2 x i64> %tmp1
}

; fmul of %A by a splat of lane 1 of %B; a by-element fmul.2s reading v1[1]
; is expected.
define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: fmul_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul.2s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = fmul <2 x float> %A, %tmp3
  ret <2 x float> %tmp4
}

; <4 x float> variant; a by-element fmul.4s reading v1[1] is expected.
define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: fmul_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = fmul <4 x float> %A, %tmp3
  ret <4 x float> %tmp4
}

; <2 x double> variant; a by-element fmul.2d reading v1[1] is expected.
define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
; CHECK-LABEL: fmul_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul.2d v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = fmul <2 x double> %A, %tmp3
  ret <2 x double> %tmp4
}

; Scalar float multiply by lane 3 extracted from a vector; the scalar
; by-element form fmul.s reading v1[3] is expected.
define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmul_lane_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul.s s0, s0, v1[3]
; CHECK-NEXT: ret
  %B = extractelement <4 x float> %vec, i32 3
  %res = fmul float %A, %B
  ret float %res
}

; Scalar double multiply by lane 1 extracted from a vector; fmul.d reading
; v1[1] is expected.
define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmul_lane_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul.d d0, d0, v1[1]
; CHECK-NEXT: ret
  %B = extractelement <2 x double> %vec, i32 1
  %res = fmul double %A, %B
  ret double %res
}



; aarch64.neon.fmulx intrinsic with a lane-1 splat of %B as second operand;
; a by-element fmulx.2s reading v1[1] is expected.
define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: fmulx_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx.2s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %tmp3)
  ret <2 x float> %tmp4
}

; <4 x float> variant; a by-element fmulx.4s reading v1[1] is expected.
define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: fmulx_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %tmp3)
  ret <4 x float> %tmp4
}

; <2 x double> variant; a by-element fmulx.2d reading v1[1] is expected.
define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
; CHECK-LABEL: fmulx_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %tmp3)
  ret <2 x double> %tmp4
}

; aarch64.neon.sqdmulh intrinsic with a lane-1 splat of %B as second operand;
; a by-element sqdmulh.4h reading v1[1] is expected.
define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
  ret <4 x i16> %tmp4
}

; <8 x i16> variant; a by-element sqdmulh.8h reading v1[1] is expected.
define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
  ret <8 x i16> %tmp4
}

; <2 x i32> variant; a by-element sqdmulh.2s reading v1[1] is expected.
define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
  ret <2 x i32> %tmp4
}

; <4 x i32> variant; a by-element sqdmulh.4s reading v1[1] is expected.
define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
  ret <4 x i32> %tmp4
}

; Scalar i32 sqdmulh with lane 1 extracted from %B. The expected assembly
; moves %A into an FP/SIMD register, uses sqdmulh.s by element on v0[1], and
; moves the result back to w0.
define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: sqdmulh.s s0, s1, v0[1]
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %tmp1 = extractelement <4 x i32> %B, i32 1
  %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
  ret i32 %tmp2
}

; aarch64.neon.sqrdmulh intrinsic with a lane-1 splat of %B; a by-element
; sqrdmulh.4h reading v1[1] is expected.
define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
  ret <4 x i16> %tmp4
}

; <8 x i16> variant; a by-element sqrdmulh.8h reading v1[1] is expected.
define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
  ret <8 x i16> %tmp4
}

; <2 x i32> variant; a by-element sqrdmulh.2s reading v1[1] is expected.
define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
  ret <2 x i32> %tmp4
}

; <4 x i32> variant; a by-element sqrdmulh.4s reading v1[1] is expected.
define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
  ret <4 x i32> %tmp4
}

; Scalar i32 sqrdmulh with lane 1 extracted from %B; the expected assembly is
; fmov in, sqrdmulh.s by element on v0[1], fmov out.
define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: sqrdmulh.s s0, s1, v0[1]
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %tmp1 = extractelement <4 x i32> %B, i32 1
  %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
  ret i32 %tmp2
}

; aarch64.neon.sqdmull on <4 x i16> with a lane-1 splat of %B; a by-element
; sqdmull.4s reading v1[1] is expected.
define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqdmull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmull.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
  ret <4 x i32> %tmp4
}

; <2 x i32> -> <2 x i64> variant; a by-element sqdmull.2d reading v1[1] is
; expected.
define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqdmull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmull.2d v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
  ret <2 x i64> %tmp4
}

; sqdmull of elements 4..7 of %A by a lane-1 splat of %B; the expected
; assembly is the second-half form sqdmull2.4s reading v1[1].
define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqdmull2_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull2.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp4
}

; sqdmull of elements 2..3 of %A by a lane-1 splat of %B; sqdmull2.2d reading
; v1[1] is expected.
define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmull2_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull2.2d v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp4
}

; aarch64.neon.umull with a lane-1 splat of %B; a by-element umull.4s reading
; v1[1] is expected.
define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: umull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umull.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
  ret <4 x i32> %tmp4
}

; <2 x i32> -> <2 x i64> variant; a by-element umull.2d reading v1[1] is
; expected.
define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: umull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umull.2d v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
  ret <2 x i64> %tmp4
}

; aarch64.neon.smull with a lane-1 splat of %B; a by-element smull.4s reading
; v1[1] is expected.
define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: smull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smull.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
  ret <4 x i32> %tmp4
}

; <2 x i32> -> <2 x i64> variant; a by-element smull.2d reading v1[1] is
; expected.
define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: smull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smull.2d v0, v0, v1[1]
; CHECK-NEXT: ret
  %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
  ret <2 x i64> %tmp4
}

; %C + smull(%A, splat of %B lane 1). The expected assembly fuses the add
; into a by-element smlal.4s accumulating in v2, then copies v2 into v0.
define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: smlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlal.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
  %tmp6 = add <4 x i32> %C, %tmp5
  ret <4 x i32> %tmp6
}

; <2 x i64> accumulator variant; a by-element smlal.2d is expected.
define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlal.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
  %tmp6 = add <2 x i64> %C, %tmp5
  ret <2 x i64> %tmp6
}

; sqadd(%C, sqdmull(%A, splat of %B lane 1)); the expected assembly fuses the
; pair into a by-element sqdmlal.4s accumulating in v2.
define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlal.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
  ret <4 x i32> %tmp6
}

; <2 x i64> accumulator variant; a by-element sqdmlal.2d is expected.
define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlal.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; As sqdmlal_lane_4s, but the multiplicand is elements 4..7 of %A; the
; second-half form sqdmlal2.4s reading v1[1] is expected.
define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal2_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
  ret <4 x i32> %tmp6
}

; As sqdmlal_lane_2d, but the multiplicand is elements 2..3 of %A;
; sqdmlal2.2d reading v1[1] is expected.
define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlal2_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; Scalar accumulate: %B is inserted into lane 0 of a vector, multiplied via
; vector sqdmull against a shuffle that puts lane 1 of %C into lane 0, and
; lane 0 of the product is sqadd'ed to %A. The expected assembly is the
; scalar by-element form sqdmlal.h reading v0[1].
define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqdmlal.h s2, h1, v0[1]
; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %prod = extractelement <4 x i32> %prod.vec, i32 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
  ret i32 %res
}
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)

; Subtracting counterpart of sqdmlal_lane_1s (sqsub instead of sqadd); the
; scalar by-element form sqdmlsl.h reading v0[1] is expected.
define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_1s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqdmlsl.h s2, h1, v0[1]
; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %prod = extractelement <4 x i32> %prod.vec, i32 0
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
  ret i32 %res
}
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)

; Full-vector sqdmull whose lane 1 (not lane 0) is extracted and sqadd'ed to
; %A. The expected assembly keeps the operations separate: sqdmull.4s, a lane
; move through w8, then a scalar sqadd.
define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: sqadd_lane1_sqdmull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull.4s v0, v0, v1
; CHECK-NEXT: mov.s w8, v0[1]
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: sqadd s0, s0, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
  %prod = extractelement <4 x i32> %prod.vec, i32 1
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
  ret i32 %res
}

; Subtracting counterpart of sqadd_lane1_sqdmull4s; the expected assembly is
; sqdmull.4s, a lane move through w8, then a scalar sqsub.
define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: sqsub_lane1_sqdmull4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull.4s v0, v0, v1
; CHECK-NEXT: mov.s w8, v0[1]
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: sqsub s0, s0, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
  %prod = extractelement <4 x i32> %prod.vec, i32 1
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
  ret i32 %res
}

; Scalar i64 accumulate: sqdmulls.scalar(%B, lane 1 of %C) sqadd'ed to %A;
; the scalar by-element form sqdmlal.s reading v0[1] is expected.
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_1d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, x0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqdmlal.s d1, s2, v0[1]
; CHECK-NEXT: fmov x0, d1
; CHECK-NEXT: ret
  %rhs = extractelement <2 x i32> %C, i32 1
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
  %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
  ret i64 %res
}
declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)

; Subtracting counterpart of sqdmlal_lane_1d; sqdmlsl.s reading v0[1] is
; expected.
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_1d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, x0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: sqdmlsl.s d1, s2, v0[1]
; CHECK-NEXT: fmov x0, d1
; CHECK-NEXT: ret
  %rhs = extractelement <2 x i32> %C, i32 1
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
  %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
  ret i64 %res
}
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)


; %C + umull(%A, splat of %B lane 1); a by-element umlal.4s accumulating in
; v2 is expected, followed by a copy of v2 into v0.
define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: umlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlal.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
  %tmp6 = add <4 x i32> %C, %tmp5
  ret <4 x i32> %tmp6
}

; <2 x i64> accumulator variant; a by-element umlal.2d is expected.
define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlal.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
  %tmp6 = add <2 x i64> %C, %tmp5
  ret <2 x i64> %tmp6
}


; %C - smull(%A, splat of %B lane 1); a by-element smlsl.4s accumulating in
; v2 is expected, followed by a copy of v2 into v0.
define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: smlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlsl.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
  %tmp6 = sub <4 x i32> %C, %tmp5
  ret <4 x i32> %tmp6
}

; %C - smull(%A, splat of %B lane 1) with a <2 x i64> accumulator; a
; by-element smlsl.2d accumulating in v2 is expected, then v2 is copied to v0.
define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlsl.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
  %tmp6 = sub <2 x i64> %C, %tmp5
  ret <2 x i64> %tmp6
}

; sqsub(%C, sqdmull(%A, splat of %B lane 1)); the expected assembly fuses the
; pair into a by-element sqdmlsl.4s accumulating in v2.
define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlsl.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
  ret <4 x i32> %tmp6
}

; <2 x i64> accumulator variant; a by-element sqdmlsl.2d is expected.
define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlsl.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; As sqdmlsl_lane_4s, but the multiplicand is elements 4..7 of %A; the
; second-half form sqdmlsl2.4s reading v1[1] is expected.
define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl2_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlsl2.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
  ret <4 x i32> %tmp6
}

; As sqdmlsl_lane_2d, but the multiplicand is elements 2..3 of %A;
; sqdmlsl2.2d reading v1[1] is expected.
define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlsl2_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlsl2.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; %C - umull(%A, splat of %B lane 1); a by-element umlsl.4s accumulating in
; v2 is expected, followed by a copy of v2 into v0.
define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: umlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlsl.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
  %tmp6 = sub <4 x i32> %C, %tmp5
  ret <4 x i32> %tmp6
}

define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlsl.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
  %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x
i32> <i32 1, i32 1> 1724 %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) 1725 %tmp6 = sub <2 x i64> %C, %tmp5 1726 ret <2 x i64> %tmp6 1727} 1728 1729; Scalar FMULX 1730define float @fmulxs(float %a, float %b) nounwind { 1731; CHECK-LABEL: fmulxs: 1732; CHECK: // %bb.0: 1733; CHECK-NEXT: fmulx s0, s0, s1 1734; CHECK-NEXT: ret 1735 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind 1736 ret float %fmulx.i 1737} 1738 1739define double @fmulxd(double %a, double %b) nounwind { 1740; CHECK-LABEL: fmulxd: 1741; CHECK: // %bb.0: 1742; CHECK-NEXT: fmulx d0, d0, d1 1743; CHECK-NEXT: ret 1744 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind 1745 ret double %fmulx.i 1746} 1747 1748define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind { 1749; CHECK-LABEL: fmulxs_lane: 1750; CHECK: // %bb.0: 1751; CHECK-NEXT: fmulx.s s0, s0, v1[3] 1752; CHECK-NEXT: ret 1753 %b = extractelement <4 x float> %vec, i32 3 1754 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind 1755 ret float %fmulx.i 1756} 1757 1758define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind { 1759; CHECK-LABEL: fmulxd_lane: 1760; CHECK: // %bb.0: 1761; CHECK-NEXT: fmulx.d d0, d0, v1[1] 1762; CHECK-NEXT: ret 1763 %b = extractelement <2 x double> %vec, i32 1 1764 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind 1765 ret double %fmulx.i 1766} 1767 1768declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone 1769declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone 1770 1771 1772define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { 1773; CHECK-LABEL: smull2_8h_simple: 1774; CHECK: // %bb.0: 1775; CHECK-NEXT: smull2.8h v0, v0, v1 1776; CHECK-NEXT: ret 1777 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 
14, i32 15> 1778 %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1779 %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2 1780 ret <8 x i16> %3 1781} 1782 1783define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind { 1784; CHECK-LABEL: foo0: 1785; CHECK: // %bb.0: 1786; CHECK-NEXT: smull2.8h v0, v0, v1 1787; CHECK-NEXT: ret 1788 %tmp = bitcast <16 x i8> %a to <2 x i64> 1789 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1790 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8> 1791 %tmp2 = bitcast <16 x i8> %b to <2 x i64> 1792 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1793 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8> 1794 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 1795 ret <8 x i16> %vmull.i.i 1796} 1797 1798define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind { 1799; CHECK-LABEL: foo1: 1800; CHECK: // %bb.0: 1801; CHECK-NEXT: smull2.4s v0, v0, v1 1802; CHECK-NEXT: ret 1803 %tmp = bitcast <8 x i16> %a to <2 x i64> 1804 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1805 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1806 %tmp2 = bitcast <8 x i16> %b to <2 x i64> 1807 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1808 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 1809 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1810 ret <4 x i32> %vmull2.i.i 1811} 1812 1813define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { 1814; CHECK-LABEL: foo2: 1815; CHECK: // %bb.0: 1816; CHECK-NEXT: smull2.2d v0, v0, v1 1817; CHECK-NEXT: ret 1818 %tmp = bitcast <4 x i32> %a to <2 x i64> 1819 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1820 %tmp1 = 
bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1821 %tmp2 = bitcast <4 x i32> %b to <2 x i64> 1822 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1823 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1824 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1825 ret <2 x i64> %vmull2.i.i 1826} 1827 1828define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { 1829; CHECK-LABEL: foo3: 1830; CHECK: // %bb.0: 1831; CHECK-NEXT: umull2.8h v0, v0, v1 1832; CHECK-NEXT: ret 1833 %tmp = bitcast <16 x i8> %a to <2 x i64> 1834 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1835 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8> 1836 %tmp2 = bitcast <16 x i8> %b to <2 x i64> 1837 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1838 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8> 1839 %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 1840 ret <8 x i16> %vmull.i.i 1841} 1842 1843define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { 1844; CHECK-LABEL: foo4: 1845; CHECK: // %bb.0: 1846; CHECK-NEXT: umull2.4s v0, v0, v1 1847; CHECK-NEXT: ret 1848 %tmp = bitcast <8 x i16> %a to <2 x i64> 1849 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1850 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1851 %tmp2 = bitcast <8 x i16> %b to <2 x i64> 1852 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1853 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 1854 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 1855 ret <4 x i32> %vmull2.i.i 1856} 1857 1858define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { 1859; CHECK-LABEL: foo5: 1860; CHECK: // %bb.0: 1861; CHECK-NEXT: umull2.2d v0, v0, v1 1862; CHECK-NEXT: ret 1863 %tmp = bitcast 
<4 x i32> %a to <2 x i64> 1864 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 1865 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1866 %tmp2 = bitcast <4 x i32> %b to <2 x i64> 1867 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1868 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1869 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1870 ret <2 x i64> %vmull2.i.i 1871} 1872 1873define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { 1874; CHECK-LABEL: foo6: 1875; CHECK: // %bb.0: // %entry 1876; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1877; CHECK-NEXT: smull2.4s v0, v1, v2[1] 1878; CHECK-NEXT: ret 1879entry: 1880 %0 = bitcast <8 x i16> %b to <2 x i64> 1881 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1882 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> 1883 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1884 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind 1885 ret <4 x i32> %vmull2.i 1886} 1887 1888define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { 1889; CHECK-LABEL: foo6a: 1890; CHECK: // %bb.0: // %entry 1891; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1892; CHECK-NEXT: smull.4s v0, v1, v2[1] 1893; CHECK-NEXT: ret 1894entry: 1895 %0 = bitcast <8 x i16> %b to <2 x i64> 1896 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> 1897 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> 1898 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1899 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind 1900 ret <4 x i32> %vmull2.i 1901} 1902 1903define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> 
%b, <2 x i32> %c) nounwind readnone optsize ssp { 1904; CHECK-LABEL: foo7: 1905; CHECK: // %bb.0: // %entry 1906; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1907; CHECK-NEXT: smull2.2d v0, v1, v2[1] 1908; CHECK-NEXT: ret 1909entry: 1910 %0 = bitcast <4 x i32> %b to <2 x i64> 1911 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1912 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> 1913 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> 1914 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind 1915 ret <2 x i64> %vmull2.i 1916} 1917 1918define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { 1919; CHECK-LABEL: foo7a: 1920; CHECK: // %bb.0: // %entry 1921; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1922; CHECK-NEXT: smull.2d v0, v1, v2[1] 1923; CHECK-NEXT: ret 1924entry: 1925 %0 = bitcast <4 x i32> %b to <2 x i64> 1926 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> 1927 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> 1928 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> 1929 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind 1930 ret <2 x i64> %vmull2.i 1931} 1932 1933 1934define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { 1935; CHECK-LABEL: foo8: 1936; CHECK: // %bb.0: // %entry 1937; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1938; CHECK-NEXT: umull2.4s v0, v1, v2[1] 1939; CHECK-NEXT: ret 1940entry: 1941 %0 = bitcast <8 x i16> %b to <2 x i64> 1942 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1943 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> 1944 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1945 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x 
i16> %1, <4 x i16> %shuffle) nounwind 1946 ret <4 x i32> %vmull2.i 1947} 1948 1949define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { 1950; CHECK-LABEL: foo8a: 1951; CHECK: // %bb.0: // %entry 1952; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1953; CHECK-NEXT: umull.4s v0, v1, v2[1] 1954; CHECK-NEXT: ret 1955entry: 1956 %0 = bitcast <8 x i16> %b to <2 x i64> 1957 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> 1958 %1 = bitcast <1 x i64> %shuffle.i to <4 x i16> 1959 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1960 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind 1961 ret <4 x i32> %vmull2.i 1962} 1963 1964define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { 1965; CHECK-LABEL: foo9: 1966; CHECK: // %bb.0: // %entry 1967; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1968; CHECK-NEXT: umull2.2d v0, v1, v2[1] 1969; CHECK-NEXT: ret 1970entry: 1971 %0 = bitcast <4 x i32> %b to <2 x i64> 1972 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1973 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> 1974 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> 1975 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind 1976 ret <2 x i64> %vmull2.i 1977} 1978 1979define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { 1980; CHECK-LABEL: foo9a: 1981; CHECK: // %bb.0: // %entry 1982; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 1983; CHECK-NEXT: umull.2d v0, v1, v2[1] 1984; CHECK-NEXT: ret 1985entry: 1986 %0 = bitcast <4 x i32> %b to <2 x i64> 1987 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> 1988 %1 = bitcast <1 x i64> %shuffle.i to <2 x i32> 1989 %shuffle = shufflevector <2 x 
i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> 1990 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind 1991 ret <2 x i64> %vmull2.i 1992} 1993 1994define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { 1995; CHECK-LABEL: bar0: 1996; CHECK: // %bb.0: 1997; CHECK-NEXT: smlal2.8h v0, v1, v2 1998; CHECK-NEXT: ret 1999 %tmp = bitcast <16 x i8> %b to <2 x i64> 2000 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2001 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> 2002 %tmp2 = bitcast <16 x i8> %c to <2 x i64> 2003 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2004 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8> 2005 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 2006 %add.i = add <8 x i16> %vmull.i.i.i, %a 2007 ret <8 x i16> %add.i 2008} 2009 2010define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { 2011; CHECK-LABEL: bar1: 2012; CHECK: // %bb.0: 2013; CHECK-NEXT: smlal2.4s v0, v1, v2 2014; CHECK-NEXT: ret 2015 %tmp = bitcast <8 x i16> %b to <2 x i64> 2016 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2017 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> 2018 %tmp2 = bitcast <8 x i16> %c to <2 x i64> 2019 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2020 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16> 2021 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 2022 %add.i = add <4 x i32> %vmull2.i.i.i, %a 2023 ret <4 x i32> %add.i 2024} 2025 2026define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { 2027; CHECK-LABEL: bar2: 2028; CHECK: // %bb.0: 2029; CHECK-NEXT: smlal2.2d v0, v1, v2 2030; CHECK-NEXT: ret 2031 %tmp = bitcast <4 x i32> %b to <2 x i64> 2032 
%shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2033 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> 2034 %tmp2 = bitcast <4 x i32> %c to <2 x i64> 2035 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2036 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32> 2037 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 2038 %add.i = add <2 x i64> %vmull2.i.i.i, %a 2039 ret <2 x i64> %add.i 2040} 2041 2042define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { 2043; CHECK-LABEL: bar3: 2044; CHECK: // %bb.0: 2045; CHECK-NEXT: umlal2.8h v0, v1, v2 2046; CHECK-NEXT: ret 2047 %tmp = bitcast <16 x i8> %b to <2 x i64> 2048 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2049 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> 2050 %tmp2 = bitcast <16 x i8> %c to <2 x i64> 2051 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2052 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8> 2053 %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind 2054 %add.i = add <8 x i16> %vmull.i.i.i, %a 2055 ret <8 x i16> %add.i 2056} 2057 2058define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { 2059; CHECK-LABEL: bar4: 2060; CHECK: // %bb.0: 2061; CHECK-NEXT: umlal2.4s v0, v1, v2 2062; CHECK-NEXT: ret 2063 %tmp = bitcast <8 x i16> %b to <2 x i64> 2064 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2065 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> 2066 %tmp2 = bitcast <8 x i16> %c to <2 x i64> 2067 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2068 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16> 2069 %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 
2070 %add.i = add <4 x i32> %vmull2.i.i.i, %a 2071 ret <4 x i32> %add.i 2072} 2073 2074define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { 2075; CHECK-LABEL: bar5: 2076; CHECK: // %bb.0: 2077; CHECK-NEXT: umlal2.2d v0, v1, v2 2078; CHECK-NEXT: ret 2079 %tmp = bitcast <4 x i32> %b to <2 x i64> 2080 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2081 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> 2082 %tmp2 = bitcast <4 x i32> %c to <2 x i64> 2083 %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2084 %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32> 2085 %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 2086 %add.i = add <2 x i64> %vmull2.i.i.i, %a 2087 ret <2 x i64> %add.i 2088} 2089 2090define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { 2091; CHECK-LABEL: mlal2_1: 2092; CHECK: // %bb.0: 2093; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 2094; CHECK-NEXT: smlal2.4s v0, v1, v2[3] 2095; CHECK-NEXT: ret 2096 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 2097 %tmp = bitcast <8 x i16> %b to <2 x i64> 2098 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2099 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 2100 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64> 2101 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2102 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 2103 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 2104 %add = add <4 x i32> %vmull2.i.i, %a 2105 ret <4 x i32> %add 2106} 2107 2108define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { 2109; CHECK-LABEL: mlal2_2: 2110; CHECK: // %bb.0: 2111; CHECK-NEXT: // kill: def $d2 killed $d2 def 
$q2 2112; CHECK-NEXT: smlal2.2d v0, v1, v2[1] 2113; CHECK-NEXT: ret 2114 %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 2115 %tmp = bitcast <4 x i32> %b to <2 x i64> 2116 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2117 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 2118 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64> 2119 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2120 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 2121 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 2122 %add = add <2 x i64> %vmull2.i.i, %a 2123 ret <2 x i64> %add 2124} 2125 2126define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { 2127; CHECK-LABEL: mlal2_4: 2128; CHECK: // %bb.0: 2129; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 2130; CHECK-NEXT: umlal2.4s v0, v1, v2[2] 2131; CHECK-NEXT: ret 2132 %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2133 %tmp = bitcast <8 x i16> %b to <2 x i64> 2134 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2135 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 2136 %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64> 2137 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2138 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16> 2139 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind 2140 %add = add <4 x i32> %vmull2.i.i, %a 2141 ret <4 x i32> %add 2142} 2143 2144define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { 2145; CHECK-LABEL: mlal2_5: 2146; CHECK: // %bb.0: 2147; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 2148; CHECK-NEXT: umlal2.2d v0, v1, v2[0] 2149; CHECK-NEXT: ret 2150 %shuffle = shufflevector <2 x i32> %c, <2 
x i32> undef, <4 x i32> zeroinitializer 2151 %tmp = bitcast <4 x i32> %b to <2 x i64> 2152 %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 2153 %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 2154 %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64> 2155 %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 2156 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 2157 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 2158 %add = add <2 x i64> %vmull2.i.i, %a 2159 ret <2 x i64> %add 2160} 2161 2162; rdar://12328502 2163define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp { 2164; CHECK-LABEL: vmulq_n_f64: 2165; CHECK: // %bb.0: // %entry 2166; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 2167; CHECK-NEXT: fmul.2d v0, v0, v1[0] 2168; CHECK-NEXT: ret 2169entry: 2170 %vecinit.i = insertelement <2 x double> undef, double %y, i32 0 2171 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1 2172 %mul.i = fmul <2 x double> %vecinit1.i, %x 2173 ret <2 x double> %mul.i 2174} 2175 2176define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp { 2177; CHECK-LABEL: vmulq_n_f32: 2178; CHECK: // %bb.0: // %entry 2179; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 2180; CHECK-NEXT: fmul.4s v0, v0, v1[0] 2181; CHECK-NEXT: ret 2182entry: 2183 %vecinit.i = insertelement <4 x float> undef, float %y, i32 0 2184 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1 2185 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2 2186 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3 2187 %mul.i = fmul <4 x float> %vecinit3.i, %x 2188 ret <4 x float> %mul.i 2189} 2190 2191define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp { 2192; CHECK-LABEL: vmul_n_f32: 2193; CHECK: // %bb.0: // %entry 2194; CHECK-NEXT: // kill: def $s1 killed $s1 
def $q1 2195; CHECK-NEXT: fmul.2s v0, v0, v1[0] 2196; CHECK-NEXT: ret 2197entry: 2198 %vecinit.i = insertelement <2 x float> undef, float %y, i32 0 2199 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1 2200 %mul.i = fmul <2 x float> %vecinit1.i, %x 2201 ret <2 x float> %mul.i 2202} 2203 2204define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp { 2205; CHECK-LABEL: vmla_laneq_s16_test: 2206; CHECK: // %bb.0: // %entry 2207; CHECK-NEXT: mla.4h v0, v1, v2[6] 2208; CHECK-NEXT: ret 2209entry: 2210 %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 2211 %mul = mul <4 x i16> %shuffle, %b 2212 %add = add <4 x i16> %mul, %a 2213 ret <4 x i16> %add 2214} 2215 2216define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp { 2217; CHECK-LABEL: vmla_laneq_s32_test: 2218; CHECK: // %bb.0: // %entry 2219; CHECK-NEXT: mla.2s v0, v1, v2[3] 2220; CHECK-NEXT: ret 2221entry: 2222 %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3> 2223 %mul = mul <2 x i32> %shuffle, %b 2224 %add = add <2 x i32> %mul, %a 2225 ret <2 x i32> %add 2226} 2227 2228define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp { 2229; CHECK-LABEL: not_really_vmlaq_laneq_s16_test: 2230; CHECK: // %bb.0: // %entry 2231; CHECK-NEXT: mla.8h v0, v1, v2[5] 2232; CHECK-NEXT: ret 2233entry: 2234 %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2235 %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2236 %mul = mul <8 x i16> %shuffle2, %b 2237 %add = add <8 x i16> %mul, %a 2238 ret <8 x i16> %add 2239} 2240 2241define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp { 2242; CHECK-LABEL: 
not_really_vmlaq_laneq_s32_test: 2243; CHECK: // %bb.0: // %entry 2244; CHECK-NEXT: mla.4s v0, v1, v2[3] 2245; CHECK-NEXT: ret 2246entry: 2247 %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 2248 %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 2249 %mul = mul <4 x i32> %shuffle2, %b 2250 %add = add <4 x i32> %mul, %a 2251 ret <4 x i32> %add 2252} 2253 2254define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { 2255; CHECK-LABEL: vmull_laneq_s16_test: 2256; CHECK: // %bb.0: // %entry 2257; CHECK-NEXT: smull.4s v0, v0, v1[6] 2258; CHECK-NEXT: ret 2259entry: 2260 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 2261 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 2262 ret <4 x i32> %vmull2.i 2263} 2264 2265define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { 2266; CHECK-LABEL: vmull_laneq_s32_test: 2267; CHECK: // %bb.0: // %entry 2268; CHECK-NEXT: smull.2d v0, v0, v1[2] 2269; CHECK-NEXT: ret 2270entry: 2271 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2> 2272 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 2273 ret <2 x i64> %vmull2.i 2274} 2275define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { 2276; CHECK-LABEL: vmull_laneq_u16_test: 2277; CHECK: // %bb.0: // %entry 2278; CHECK-NEXT: umull.4s v0, v0, v1[6] 2279; CHECK-NEXT: ret 2280entry: 2281 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 2282 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 2283 ret <4 x i32> %vmull2.i 2284} 2285 2286define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { 2287; 
CHECK-LABEL: vmull_laneq_u32_test: 2288; CHECK: // %bb.0: // %entry 2289; CHECK-NEXT: umull.2d v0, v0, v1[2] 2290; CHECK-NEXT: ret 2291entry: 2292 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2> 2293 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 2294 ret <2 x i64> %vmull2.i 2295} 2296 2297define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { 2298; CHECK-LABEL: vmull_low_n_s16_test: 2299; CHECK: // %bb.0: // %entry 2300; CHECK-NEXT: dup.4h v0, w0 2301; CHECK-NEXT: smull.4s v0, v1, v0 2302; CHECK-NEXT: ret 2303entry: 2304 %conv = trunc i32 %d to i16 2305 %0 = bitcast <8 x i16> %b to <2 x i64> 2306 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0> 2307 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 2308 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 2309 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 2310 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 2311 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 2312 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind 2313 ret <4 x i32> %vmull2.i.i 2314} 2315 2316define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { 2317; CHECK-LABEL: vmull_high_n_s16_test: 2318; CHECK: // %bb.0: // %entry 2319; CHECK-NEXT: dup.8h v0, w0 2320; CHECK-NEXT: smull2.4s v0, v1, v0 2321; CHECK-NEXT: ret 2322entry: 2323 %conv = trunc i32 %d to i16 2324 %0 = bitcast <8 x i16> %b to <2 x i64> 2325 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 2326 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 2327 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 2328 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 2329 
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 2330 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 2331 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind 2332 ret <4 x i32> %vmull2.i.i 2333} 2334 2335define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { 2336; CHECK-LABEL: vmull_high_n_s32_test: 2337; CHECK: // %bb.0: // %entry 2338; CHECK-NEXT: dup.4s v0, w0 2339; CHECK-NEXT: smull2.2d v0, v1, v0 2340; CHECK-NEXT: ret 2341entry: 2342 %0 = bitcast <4 x i32> %b to <2 x i64> 2343 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 2344 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 2345 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 2346 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1 2347 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind 2348 ret <2 x i64> %vmull2.i.i 2349} 2350 2351define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { 2352; CHECK-LABEL: vmull_high_n_u16_test: 2353; CHECK: // %bb.0: // %entry 2354; CHECK-NEXT: dup.8h v0, w0 2355; CHECK-NEXT: umull2.4s v0, v1, v0 2356; CHECK-NEXT: ret 2357entry: 2358 %conv = trunc i32 %d to i16 2359 %0 = bitcast <8 x i16> %b to <2 x i64> 2360 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 2361 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 2362 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 2363 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 2364 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 2365 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 2366 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind 2367 
ret <4 x i32> %vmull2.i.i
}

; Widening unsigned multiply of the upper half of %b by a splat of the scalar %d.
; The upper half is taken by viewing %b as <2 x i64> and shuffling out element 1;
; the splat is built with two insertelements. %a and %c are unused.
; Expected selection: a 128-bit dup of w0 followed by umull2.2d.
define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
; CHECK-LABEL: vmull_high_n_u32_test:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup.4s v0, w0
; CHECK-NEXT: umull2.2d v0, v1, v0
; CHECK-NEXT: ret
entry:
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}

; Lane 1 of %b is extracted and re-inserted into all four lanes by hand, then
; multiplied with %a. Expected selection: a single by-element mul.4s using v1[1],
; with no separate dup.
define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vmul_built_dup_test:
; CHECK: // %bb.0:
; CHECK-NEXT: mul.4s v0, v0, v1[1]
; CHECK-NEXT: ret
  %vget_lane = extractelement <4 x i32> %b, i32 1
  %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
  %prod = mul <4 x i32> %a, %vecinit3.i
  ret <4 x i32> %prod
}

; Same hand-built splat pattern on a 64-bit vector: lane 3 of the <4 x i16> %b is
; splatted and multiplied with %a. Expected selection: by-element mul.4h using
; v1[3].
define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmul_built_dup_fromsmall_test:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul.4h v0, v0, v1[3]
; CHECK-NEXT: ret
  %vget_lane = extractelement <4 x i16> %b, i32 3
  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
  %prod = mul <4 x i16> %a, %vecinit3.i
  ret <4 x i16> %prod
}

; Lane 0 of the 64-bit <4 x i16> %b is splatted into a 128-bit <8 x i16> vector
; (eight insertelements) and multiplied with %a. Expected selection: by-element
; mul.8h using v1[0].
define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul.8h v0, v0, v1[0]
; CHECK-NEXT: ret
  %vget_lane = extractelement <4 x i16> %b, i32 0
  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
  %prod = mul <8 x i16> %a, %vecinit7.i
  ret <8 x i16> %prod
}

; Both operands of the sqdmull intrinsic are the high halves (elements 2 and 3)
; of 128-bit vectors. Expected selection: one sqdmull2.2d reading the full
; registers, with no separate extract.
define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mull_from_two_extracts:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull2.2d v0, v0, v1
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

; As mull_from_two_extracts, but the sqdmull result is accumulated into %accum
; with the sqadd intrinsic. Expected selection: one sqdmlal2.2d.
define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mlal_from_two_extracts:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2.2d v0, v1, v2
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

; sqdmull of the LOW half of %lhs (elements 0 and 1) by a splat of the scalar
; %rhs. Note: the extracted value is named %lhs.high although the shuffle mask
; selects the low half. Expected selection: 64-bit dup.2s plus the non-"2" form
; sqdmull.2d.
define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_low:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2s v1, w0
; CHECK-NEXT: sqdmull.2d v0, v0, v1
; CHECK-NEXT: ret
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ret <2 x i64> %res
}

; sqdmull of the high half of %lhs (elements 2 and 3) by a splat of the scalar
; %rhs. Expected selection: the splat is widened to a 128-bit dup.4s so that
; sqdmull2.2d can be used.
define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup_high:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4s v1, w0
; CHECK-NEXT: sqdmull2.2d v0, v0, v1
; CHECK-NEXT: ret
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  ret <2 x i64> %res
}

; pmull of the LOW eight bytes of %lhs by a splat of the scalar %rhs (built with
; an insertelement plus an all-zero shuffle mask). Note: the extracted value is
; named %lhs.high although the mask selects elements 0-7. Expected selection:
; dup.8b plus pmull.8h.
define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_low:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8b v1, w0
; CHECK-NEXT: pmull.8h v0, v0, v1
; CHECK-NEXT: ret
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>

  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
  ret <8 x i16> %res
}

; pmull of the high eight bytes of %lhs (elements 8-15) by a splat of the scalar
; %rhs. Expected selection: 128-bit dup.16b plus pmull2.8h.
define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup_high:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.16b v1, w0
; CHECK-NEXT: pmull2.8h v0, v0, v1
; CHECK-NEXT: ret
  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>

  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
  ret <8 x i16> %res
}

; pmull of the LOW eight bytes of %lhs by a splat of lane 0 of the vector %rhs.
; Note: %lhs.high holds elements 0-7 and %rhs.high holds the lane-0 splat; the
; ".high" suffixes do not describe the values. Expected selection: an explicit
; dup.8b from v1[0] followed by pmull.8h.
define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: dup.8b v1, v1[0]
; CHECK-NEXT: pmull.8h v0, v0, v1
; CHECK-NEXT: ret
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>

  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
  ret <8 x i16> %res
}

; pmull of the high eight bytes of %lhs (elements 8-15) by a splat of lane 0 of
; %rhs (held in %rhs.high). Expected selection: 128-bit dup.16b from v1[0]
; followed by pmull2.8h.
define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane_high:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: dup.16b v1, v1[0]
; CHECK-NEXT: pmull2.8h v0, v0, v1
; CHECK-NEXT: ret
  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>

  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
  ret <8 x i16> %res
}

; sqdmull of the LOW half of %lhs (elements 0 and 1, despite the %lhs.high name)
; by a splat of lane 0 of %rhs. Expected selection: the splat folds into the
; by-element form sqdmull.2d ... v1[0].
define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull.2d v0, v0, v1[0]
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

; sqdmull of the high half of %lhs (elements 2 and 3) by a splat of lane 0 of
; %rhs. Expected selection: by-element sqdmull2.2d ... v1[0].
define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane_high:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmull2.2d v0, v0, v1[0]
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  ret <2 x i64> %res
}

; Low-half sqdmull by a lane-0 splat, accumulated into %accum with sqadd.
; (%lhs.high holds elements 0 and 1 here.) Expected selection: by-element
; sqdmlal.2d ... v2[0].
define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal.2d v0, v1, v2[0]
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

; High-half sqdmull by a lane-0 splat, accumulated into %accum with sqadd.
; Expected selection: by-element sqdmlal2.2d ... v2[0].
define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane_high:
; CHECK: // %bb.0:
; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[0]
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
  ret <2 x i64> %sum
}

; Low-half umull by a lane-0 splat, accumulated with a plain IR add rather than
; an intrinsic. (%lhs.high holds elements 0 and 1 here.) Expected selection:
; by-element umlal.2d ... v2[0].
define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_low:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal.2d v0, v1, v2[0]
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}

; High-half umull by a lane-0 splat, accumulated with a plain IR add.
; Expected selection: by-element umlal2.2d ... v2[0].
define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane_high:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal2.2d v0, v1, v2[0]
; CHECK-NEXT: ret
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>

  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
  %sum = add <2 x i64> %accum, %res
  ret <2 x i64> %sum
}

; Scalar fma whose multiplicand is lane 3 of a 128-bit float vector.
; Expected selection: the extract folds into the indexed scalar form
; fmla.s ... v2[3].
define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla.s s0, s1, v2[3]
; CHECK-NEXT: ret
  %rhs = extractelement <4 x float> %rvec, i32 3
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

; Scalar fma whose multiplicand is lane 1 of a 64-bit float vector.
; Expected selection: indexed fmla.s ... v2[1].
define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla.s s0, s1, v2[1]
; CHECK-NEXT: ret
  %rhs = extractelement <2 x float> %rvec, i32 1
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

; Scalar fma where the extracted lane 3 is negated (written as fsub from -0.0)
; before the multiply. Expected selection: the negation and extract both fold
; into indexed fmls.s ... v2[3].
define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls.s s0, s1, v2[3]
; CHECK-NEXT: ret
  %rhs.scal = extractelement <4 x float> %rvec, i32 3
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

; As scalar_fmls_from_extract_v4f32, but extracting lane 1 of a 64-bit vector.
; Expected selection: indexed fmls.s ... v2[1].
define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls.s s0, s1, v2[1]
; CHECK-NEXT: ret
  %rhs.scal = extractelement <2 x float> %rvec, i32 1
  %rhs = fsub float -0.0, %rhs.scal
  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
  ret float %res
}

; Scalar single-precision fma intrinsic used by the scalar_fml*_f32 tests above.
declare float @llvm.fma.f32(float, float, float)

; Double-precision variant: fma with lane 1 of a <2 x double> as multiplicand.
; Expected selection: indexed fmla.d ... v2[1].
define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmla.d d0, d1, v2[1]
; CHECK-NEXT: ret
  %rhs = extractelement <2 x double> %rvec, i32 1
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

; Double-precision variant with the extracted lane negated (fsub from -0.0).
; Expected selection: indexed fmls.d ... v2[1].
define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls.d d0, d1, v2[1]
; CHECK-NEXT: ret
  %rhs.scal = extractelement <2 x double> %rvec, i32 1
  %rhs = fsub double -0.0, %rhs.scal
  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
  ret double %res
}

; Scalar double-precision fma intrinsic used by the scalar_fml*_f64 tests above.
declare double @llvm.fma.f64(double, double, double)

; Vector fma where the WHOLE <4 x float> %rhs is negated first and lane 3 of the
; negated vector is then splatted to two lanes. Expected selection: by-element
; fmls.2s ... v2[3], with no separate negate or dup.
define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls.2s v0, v1, v2[3]
; CHECK-NEXT: ret
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

; Same negate-then-splat pattern with a 64-bit <2 x float> %rhs, splatting
; lane 1. Expected selection: by-element fmls.2s ... v2[1].
define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls.2s v0, v1, v2[1]
; CHECK-NEXT: ret
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
  ret <2 x float> %res
}

; Negate-then-splat with 128-bit operands throughout: lane 3 of the negated
; <4 x float> %rhs is splatted to four lanes. Expected selection: by-element
; fmls.4s ... v2[3].
define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls.4s v0, v1, v2[3]
; CHECK-NEXT: ret
  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

; Negate-then-splat where lane 1 of a 64-bit <2 x float> %rhs is widened to a
; four-lane splat. Expected selection: by-element fmls.4s ... v2[1].
define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls.4s v0, v1, v2[1]
; CHECK-NEXT: ret
  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
  ret <4 x float> %res
}

; Double-precision negate-then-splat: lane 1 of the negated <2 x double> %rhs.
; Expected selection: by-element fmls.2d ... v2[1].
define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmls.2d v0, v1, v2[1]
; CHECK-NEXT: ret
  %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
  ret <2 x double> %res
}

; fmul on the single-element vector type <1 x double>.
; Expected selection: the scalar instruction fmul d0, d0, d1.
define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fmul_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
  %prod = fmul <1 x double> %L, %R
  ret <1 x double> %prod
}

; fdiv on the single-element vector type <1 x double>. Note: the result is named
; %prod although it is a quotient. Expected selection: scalar fdiv d0, d0, d1.
define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fdiv_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fdiv d0, d0, d1
; CHECK-NEXT: ret
  %prod = fdiv <1 x double> %L, %R
  ret <1 x double> %prod
}

; Scalar i16 saturating doubling multiply-accumulate expressed through vectors:
; %A and %B are each placed in lane 0 of an otherwise-undef <4 x i16>, multiplied
; with the v4i32 sqdmull intrinsic, and lane 0 of the product is added to %C with
; the i32 sqadd intrinsic. Expected selection: three fmovs into FP/SIMD registers,
; one sqdmlal.h s2, h0, v1[0], and an fmov back to w0.
define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
; CHECK-LABEL: sqdmlal_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov s2, w2
; CHECK-NEXT: sqdmlal.h s2, h0, v1[0]
; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
  %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
  ret i32 %tmp5
}

; Scalar i32 -> i64 variant using the scalar sqdmulls intrinsic followed by the
; i64 sqadd intrinsic. Expected selection: operands moved with fmov, one scalar
; sqdmlal d0, s1, s2, and an fmov back to x0.
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x2
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: sqdmlal d0, s1, s2
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

; Subtracting counterpart of sqdmlal_s: identical IR except that the product is
; combined with %C through the i32 sqsub intrinsic.
; Expected selection: sqdmlsl.h s2, h0, v1[0].
define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
; CHECK-LABEL: sqdmlsl_s:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov s2, w2
; CHECK-NEXT: sqdmlsl.h s2, h0, v1[0]
; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
  %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
  %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
  %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = extractelement <4 x i32> %tmp3, i64 0
  %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
  ret i32 %tmp5
}

; Subtracting counterpart of sqdmlal_d: scalar sqdmulls followed by the i64 sqsub
; intrinsic. Expected selection: scalar sqdmlsl d0, s1, s2.
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x2
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: sqdmlsl d0, s1, s2
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
  ret i64 %tmp5
}

; pmull64 intrinsic on two i64 values arriving in general-purpose registers.
; Expected selection: both operands are moved to D registers with fmov and
; multiplied by pmull.1q.
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x1
; CHECK-NEXT: fmov d1, x0
; CHECK-NEXT: pmull.1q v0, v1, v0
; CHECK-NEXT: ret
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %val
}

; pmull64 intrinsic fed with element 1 of two <2 x i64> vectors.
; Expected selection: a single pmull2.1q on the original vector registers, with
; no element extraction.
define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-LABEL: test_pmull_high_64:
; CHECK: // %bb.0:
; CHECK-NEXT: pmull2.1q v0, v0, v1
; CHECK-NEXT: ret
  %l_hi = extractelement <2 x i64> %l, i32 1
  %r_hi = extractelement <2 x i64> %r, i32 1
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
  ret <16 x i8> %val
}

; 64x64-bit polynomial multiply intrinsic used by the two test_pmull tests above.
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

; Integer mul on the single-element vector type <1 x i64>.
; Expected selection: both operands are moved to general-purpose registers,
; multiplied with the scalar mul, and the result is moved back to d0.
define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
; CHECK-LABEL: test_mul_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: mul x8, x9, x8
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
  %prod = mul <1 x i64> %lhs, %rhs
  ret <1 x i64> %prod
}