; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE
; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE

; Codegen tests for NEON widening/narrowing operations whose operands are the
; upper half of a 128-bit vector, where that half is extracted with a
; shufflevector that may sit on either side of a bitcast. On little-endian the
; expected output is the "2" (high-half) instruction form where the extract
; really is the high half; the big-endian run records the additional rev64/ext
; lane fix-ups emitted around the same operations.

declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)

; Baseline: both operands are lanes 4-7 of an <8 x i16>, no bitcast involved.
define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_base:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_base:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; First operand is bitcast from <2 x i64> BEFORE the high-half extract.
define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast <2 x i64> %aa to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Second operand is bitcast from <16 x i8> BEFORE the high-half extract.
define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %b = bitcast <16 x i8> %bb to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; First operand: high i64 lane is extracted first, THEN bitcast to <4 x i16>.
; Little-endian still selects smull2; the big-endian checks record a plain
; smull preceded by ext/rev64 fix-ups.
define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1a = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
  %s1 = bitcast <1 x i64> %s1a to <4 x i16>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Second operand: high eight i8 lanes are extracted first, THEN bitcast to
; <4 x i16>. Little-endian selects smull2; big-endian records plain smull.
define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = bitcast <8 x i8> %s2a to <4 x i16>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}


; Negative tests ("wrongindex"): one operand's shuffle mask does not select the
; high half, so the checks expect ext + plain smull instead of smull2.

; First operand takes lanes 2-5 (after bitcast) rather than 4-7.
define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT:    ext v0.8b, v0.8b, v2.8b, #4
; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast <2 x i64> %aa to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Second operand takes lanes 3-6 (after bitcast) rather than 4-7.
define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT:    ext v1.8b, v1.8b, v2.8b, #6
; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #6
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %b = bitcast <16 x i8> %bb to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; First operand extracts i32 lanes 1-2 (the middle of the vector) and only
; then bitcasts to <4 x i16>.
define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
  %s1 = bitcast <2 x i32> %s1a to <4 x i16>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Second operand extracts i8 lanes 4-11 (the middle of the vector) and only
; then bitcasts to <4 x i16>.
define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %s2 = bitcast <8 x i8> %s2a to <4 x i16>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}


; Splat tests: one operand is a lane splat, the other is a high-half extract.

; Splat of i16 lane 3 taken after the bitcast: expected to fold into the
; by-element form smull2 ..., v0.h[3].
define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splata1:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v1.8h, v0.h[3]
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splata1:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    smull2 v0.4s, v1.8h, v0.h[3]
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast <2 x i64> %aa to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Same as above with the splat on the second operand.
define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splatb1:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    smull2 v0.4s, v0.8h, v1.h[3]
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splatb1:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    smull2 v0.4s, v0.8h, v1.h[3]
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %b = bitcast <16 x i8> %bb to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Splat is done at i32 granularity and then bitcast to <4 x i16>, so it is not
; an i16 lane splat; the checks expect dup + ext + plain smull.
define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splata2:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    dup v0.2s, v0.s[3]
; CHECK-LE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splata2:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    dup v0.2s, v0.s[3]
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
  %s1 = bitcast <2 x i32> %s1a to <4 x i16>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; Splat is done at i8 granularity and then bitcast to <4 x i16>; the checks
; expect ext + dup + plain smull.
define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-LE-LABEL: test_smull_high_s16_splatb2:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-LE-NEXT:    dup v1.8b, v1.b[3]
; CHECK-LE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_smull_high_s16_splatb2:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    dup v1.8b, v1.b[3]
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %s2 = bitcast <8 x i8> %s2a to <4 x i16>
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}



; The same bitcast-then-extract pattern applied to other operations.

; Unsigned multiply long: expected umull2.
define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast <2 x i64> %aa to <8 x i16>
  %s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %r = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %s1, <4 x i16> %s2)
  ret <4 x i32> %r
}

; uabd on the high halves followed by zext: expected uabdl2.
define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) {
; CHECK-LE-LABEL: test_vabdl_high_u82:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_vabdl_high_u82:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %b = bitcast <8 x i16> %bb to <16 x i8>
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
  ret <8 x i16> %vmovl.i.i.i
}

; sabd on the high halves followed by zext: expected sabdl2.
define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) {
; CHECK-LE-LABEL: test_vabdl_high_s82:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_vabdl_high_s82:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %b = bitcast <8 x i16> %bb to <16 x i8>
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
  ret <8 x i16> %vmovl.i.i.i
}

; sqdmull on the high halves accumulated with sqadd: expected sqdmlal2.
define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) {
; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-BE-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %c = bitcast <16 x i8> %cc to <8 x i16>
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
  ret <4 x i32> %vqdmlal4.i.i
}

; Operands are i128 scalars bitcast to <16 x i8>. The checks expect an fmov
; from x1/x3 into a D register and a plain pmull, not pmull2.
define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
; CHECK-LE-LABEL: test_pmull_high_p8_128:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    fmov d0, x3
; CHECK-LE-NEXT:    fmov d1, x1
; CHECK-LE-NEXT:    pmull v0.8h, v1.8b, v0.8b
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_128:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    fmov d0, x3
; CHECK-BE-NEXT:    fmov d1, x1
; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
; CHECK-BE-NEXT:    rev64 v1.8b, v1.8b
; CHECK-BE-NEXT:    pmull v0.8h, v1.8b, v0.8b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast i128 %aa to <16 x i8>
  %b = bitcast i128 %bb to <16 x i8>
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  ret <8 x i16> %vmull.i.i
}

; Same polynomial multiply with <2 x i64> vector operands: expected pmull2.
define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) {
; CHECK-LE-LABEL: test_pmull_high_p8_64:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_64:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast <2 x i64> %aa to <16 x i8>
  %b = bitcast <2 x i64> %bb to <16 x i8>
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  ret <8 x i16> %vmull.i.i
}

; Two lshr-by-5 + trunc halves concatenated through <1 x i64> bitcasts:
; expected shrn for the low half and shrn2 for the high half.
define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) {
; CHECK-LE-LABEL: foov8i16:
; CHECK-LE:       // %bb.0:
; CHECK-LE-NEXT:    shrn v0.4h, v0.4s, #5
; CHECK-LE-NEXT:    shrn2 v0.8h, v1.4s, #5
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: foov8i16:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    shrn v0.4h, v0.4s, #5
; CHECK-BE-NEXT:    shrn2 v0.8h, v1.4s, #5
; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
  %a0 = bitcast <16 x i8> %a1 to <4 x i32>
  %b0 = bitcast <2 x i64> %b1 to <4 x i32>
  %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
  %vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16>
  %vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
  %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
  %1 = bitcast <4 x i16> %vshrn_low to <1 x i64>
  %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
  ret <8 x i16> %3
}

; zext of the high two i32 lanes followed by shl 1: expected ushll2 #1.
define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) {
; CHECK-LE-LABEL: hadd32_zext_asr:
; CHECK-LE:       // %bb.0:
; CHECK-LE-NEXT:    ushll2 v0.2d, v0.4s, #1
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: hadd32_zext_asr:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ushll2 v0.2d, v0.4s, #1
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
  %src1 = bitcast <16 x i8> %src1a to <4 x i32>
  %s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %zextsrc1 = zext <2 x i32> %s1 to <2 x i64>
  %resulti32 = shl <2 x i64> %zextsrc1, <i64 1, i64 1>
  ret <2 x i64> %resulti32
}

; 32-bit umull variant: splat of i32 lane 1 (after bitcast) times the high two
; i32 lanes; expected by-element umull2 ..., v0.s[1].
define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 {
; CHECK-LE-LABEL: test_umull_high_s16_splata1:
; CHECK-LE:       // %bb.0: // %entry
; CHECK-LE-NEXT:    umull2 v0.2d, v1.4s, v0.s[1]
; CHECK-LE-NEXT:    ret
;
; CHECK-BE-LABEL: test_umull_high_s16_splata1:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    umull2 v0.2d, v1.4s, v0.s[1]
; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT:    ret
entry:
  %a = bitcast <2 x i64> %aa to <4 x i32>
  %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %s2 = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %r = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)
  ret <2 x i64> %r
}