; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s

; Each test builds the sum (test_mla*) or difference (test_mls*) of two
; widening NEON multiplies and extracts the low half of the result. The
; CHECK lines pin instruction selection to a single umull/smull followed by
; an accumulating umlal/smlal (or subtracting umlsl/smlsl), rather than two
; independent multiplies plus a separate add/sub.

define <4 x i16> @test_mla0(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mla0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %add.i = add <8 x i16> %vmull.i.i, %vmull.i
  %shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}


define <4 x i16> @test_mla1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mla1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    smlal v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %add.i = add <8 x i16> %vmull.i.i, %vmull.i
  %shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}


define <2 x i32> @test_mla2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mla2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v2.4s, v2.4h, v3.4h
; CHECK-NEXT:    umlal v2.4s, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
  %shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}


define <2 x i32> @test_mla3(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mla3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.4s, v2.4h, v3.4h
; CHECK-NEXT:    smlal v2.4s, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
  %shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}


define <1 x i64> @test_mla4(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mla4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v2.2d, v2.2s, v3.2s
; CHECK-NEXT:    umlal v2.2d, v0.2s, v1.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
  %shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> undef, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}


define <1 x i64> @test_mla5(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mla5:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.2d, v2.2s, v3.2s
; CHECK-NEXT:    smlal v2.2d, v0.2s, v1.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
  %shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> undef, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}


define <4 x i16> @test_mls0(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mls0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    umlsl v0.8h, v2.8b, v3.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
  %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}


define <4 x i16> @test_mls1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mls1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    smlsl v0.8h, v2.8b, v3.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
  %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}


define <2 x i32> @test_mls2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mls2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    umlsl v0.4s, v2.4h, v3.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
  %shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}


define <2 x i32> @test_mls3(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mls3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    smlsl v0.4s, v2.4h, v3.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
  %shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}


define <1 x i64> @test_mls4(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mls4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    umlsl v0.2d, v2.2s, v3.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
  %shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> undef, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}


define <1 x i64> @test_mls5(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mls5:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    smlsl v0.2d, v2.2s, v3.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
  %shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> undef, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}

declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)

declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)

declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)

declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)

declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)