; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; CHECK-GI:       warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1

; Three configurations are exercised by the RUN lines above: +neon, +sve and
; GlobalISel. Functions whose expected assembly is the same in all three use
; the shared prefix; the others carry one assertion block per configuration.

; Signed widening multiply: both operands are sign-extended before the mul,
; and a single smull is expected.

define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  ret <8 x i16> %prod
}

define <4 x i32> @smull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  ret <4 x i32> %prod
}

define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %prod
}

; Mixed extensions: one operand is zero-extended, the other sign-extended.
; Except for the top_bit_is_1 case, the +neon and +sve runs expect smull;
; the GlobalISel run expects explicit extends followed by a plain mul.

define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d0, [x0]
; CHECK-NEON-NEXT:    ldr q2, [x1]
; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEON-NEXT:    smull2 v1.4s, v0.8h, v2.8h
; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v2.4h
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr d0, [x0]
; CHECK-SVE-NEXT:    ldr q2, [x1]
; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SVE-NEXT:    smull2 v1.4s, v0.8h, v2.8h
; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v2.4h
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr d0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
; CHECK-GI-NEXT:    ret
  %a = load <8 x i8>, ptr %A
  %b = load <8 x i16>, ptr %B
  %a.zext = zext <8 x i8> %a to <8 x i32>
  %b.sext = sext <8 x i16> %b to <8 x i32>
  %prod = mul <8 x i32> %a.zext, %b.sext
  ret <8 x i32> %prod
}

define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d0, [x1]
; CHECK-NEON-NEXT:    ldr q2, [x0]
; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEON-NEXT:    smull2 v1.4s, v2.8h, v0.8h
; CHECK-NEON-NEXT:    smull v0.4s, v2.4h, v0.4h
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr d0, [x1]
; CHECK-SVE-NEXT:    ldr q2, [x0]
; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SVE-NEXT:    smull2 v1.4s, v2.8h, v0.8h
; CHECK-SVE-NEXT:    smull v0.4s, v2.4h, v0.4h
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr d0, [x1]
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v4.4s
; CHECK-GI-NEXT:    ret
  %a = load <8 x i16>, ptr %A
  %b = load <8 x i8>, ptr %B
  %a.sext = sext <8 x i16> %a to <8 x i32>
  %b.zext = zext <8 x i8> %b to <8 x i32>
  %prod = mul <8 x i32> %a.sext, %b.zext
  ret <8 x i32> %prod
}

; The zero-extended operand has its most significant bit forced to 1 by the
; or; no run is expected to form smull here, only extends plus mul.
define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr q1, [x1]
; CHECK-NEON-NEXT:    orr v0.8h, #128, lsl #8
; CHECK-NEON-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NEON-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-NEON-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-NEON-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-NEON-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-NEON-NEXT:    mul v0.4s, v2.4s, v3.4s
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr q1, [x1]
; CHECK-SVE-NEXT:    orr v0.8h, #128, lsl #8
; CHECK-SVE-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-SVE-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-SVE-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-SVE-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-SVE-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-SVE-NEXT:    mul v0.4s, v2.4s, v3.4s
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    movi v0.8h, #128, lsl #8
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
; CHECK-GI-NEXT:    ret
  %a = load <8 x i16>, ptr %A
  %a.msb = or <8 x i16> %a, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
  %b = load <8 x i16>, ptr %B
  %a.zext = zext <8 x i16> %a.msb to <8 x i32>
  %b.sext = sext <8 x i16> %b to <8 x i32>
  %prod = mul <8 x i32> %a.zext, %b.sext
  ret <8 x i32> %prod
}

define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v4i16_v4i32:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr s0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_v4i16_v4i32:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr s0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: smull_zext_v4i16_v4i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr w8, [x0]
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    uxtb w8, w8
; CHECK-GI-NEXT:    mov b1, v0.b[2]
; CHECK-GI-NEXT:    mov b2, v0.b[1]
; CHECK-GI-NEXT:    mov b3, v0.b[3]
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w10, s2
; CHECK-GI-NEXT:    fmov w11, s3
; CHECK-GI-NEXT:    ldr d2, [x1]
; CHECK-GI-NEXT:    uxtb w9, w9
; CHECK-GI-NEXT:    uxtb w10, w10
; CHECK-GI-NEXT:    uxtb w11, w11
; CHECK-GI-NEXT:    fmov s1, w9
; CHECK-GI-NEXT:    mov v0.h[1], w10
; CHECK-GI-NEXT:    mov v1.h[1], w11
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    ret
  %a = load <4 x i8>, ptr %A
  %b = load <4 x i16>, ptr %B
  %a.zext = zext <4 x i8> %a to <4 x i32>
  %b.sext = sext <4 x i16> %b to <4 x i32>
  %prod = mul <4 x i32> %a.zext, %b.sext
  ret <4 x i32> %prod
}

define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldrh w8, [x0]
; CHECK-NEON-NEXT:    ldrh w9, [x0, #2]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    fmov d0, x8
; CHECK-NEON-NEXT:    mov v0.d[1], x9
; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldrh w8, [x0]
; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    fmov d0, x8
; CHECK-SVE-NEXT:    mov v0.d[1], x9
; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ld1 { v1.h }[0], [x0]
; CHECK-GI-NEXT:    ldr h2, [x0, #2]
; CHECK-GI-NEXT:    movi d0, #0x00ffff0000ffff
; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    mov w8, v0.s[0]
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    mov x11, v1.d[1]
; CHECK-GI-NEXT:    mov v0.d[0], x8
; CHECK-GI-NEXT:    mov v0.d[1], x9
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    mov x10, v0.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    mul x9, x10, x11
; CHECK-GI-NEXT:    mov v0.d[0], x8
; CHECK-GI-NEXT:    mov v0.d[1], x9
; CHECK-GI-NEXT:    ret
  %a = load <2 x i16>, ptr %A
  %b = load <2 x i32>, ptr %B
  %a.zext = zext <2 x i16> %a to <2 x i64>
  %b.sext = sext <2 x i32> %b to <2 x i64>
  %prod = mul <2 x i64> %a.zext, %b.sext
  ret <2 x i64> %prod
}

; The zero-extended operand has its top bit cleared by the and with
; 0x7FFFFFFF; the +neon and +sve runs expect bic followed by smull.
define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    bic v0.2s, #128, lsl #24
; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr d0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    bic v0.2s, #128, lsl #24
; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mvni v0.2s, #128, lsl #24
; CHECK-GI-NEXT:    ldr d1, [x0]
; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    mov x11, v1.d[1]
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    mov x10, v0.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    mul x9, x10, x11
; CHECK-GI-NEXT:    mov v0.d[0], x8
; CHECK-GI-NEXT:    mov v0.d[1], x9
; CHECK-GI-NEXT:    ret
  %a = load <2 x i32>, ptr %A
  %a.pos = and <2 x i32> %a, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
  %b = load <2 x i32>, ptr %B
  %a.zext = zext <2 x i32> %a.pos to <2 x i64>
  %b.sext = sext <2 x i32> %b to <2 x i64>
  %prod = mul <2 x i64> %a.zext, %b.sext
  ret <2 x i64> %prod
}

; Unsigned widening multiply: both operands are zero-extended before the mul,
; and a single umull is expected.

define <8 x i16> @umull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  ret <8 x i16> %prod
}

define <4 x i32> @umull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  ret <4 x i32> %prod
}

define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %prod
}

; Zero-extended multiply whose result is then masked down to the low
; source-width bits. The +neon and +sve runs expect smull here, while the
; GlobalISel run expects umull.

define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: amull_v8i8_v8i16:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amull_v8i8_v8i16:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr d0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amull_v8i8_v8i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr d1, [x0]
; CHECK-GI-NEXT:    ldr d2, [x1]
; CHECK-GI-NEXT:    movi v0.2d, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    umull v1.8h, v1.8b, v2.8b
; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT:    ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %masked = and <8 x i16> %prod, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}

define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: amull_v4i16_v4i32:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d1, [x0]
; CHECK-NEON-NEXT:    ldr d2, [x1]
; CHECK-NEON-NEXT:    movi v0.2d, #0x00ffff0000ffff
; CHECK-NEON-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-NEON-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amull_v4i16_v4i32:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr d1, [x0]
; CHECK-SVE-NEXT:    ldr d2, [x1]
; CHECK-SVE-NEXT:    movi v0.2d, #0x00ffff0000ffff
; CHECK-SVE-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-SVE-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amull_v4i16_v4i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr d1, [x0]
; CHECK-GI-NEXT:    ldr d2, [x1]
; CHECK-GI-NEXT:    movi v0.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT:    umull v1.4s, v1.4h, v2.4h
; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT:    ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %masked = and <4 x i32> %prod, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}

define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: amull_v2i32_v2i64:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr d1, [x0]
; CHECK-NEON-NEXT:    ldr d2, [x1]
; CHECK-NEON-NEXT:    movi v0.2d, #0x000000ffffffff
; CHECK-NEON-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amull_v2i32_v2i64:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr d1, [x0]
; CHECK-SVE-NEXT:    ldr d2, [x1]
; CHECK-SVE-NEXT:    movi v0.2d, #0x000000ffffffff
; CHECK-SVE-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amull_v2i32_v2i64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr d1, [x0]
; CHECK-GI-NEXT:    ldr d2, [x1]
; CHECK-GI-NEXT:    movi v0.2d, #0x000000ffffffff
; CHECK-GI-NEXT:    umull v1.2d, v1.2s, v2.2s
; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT:    ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %masked = and <2 x i64> %prod, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}

; Signed widening multiply-accumulate: the sign-extended product is added to
; a wide accumulator loaded from %A, and a single smlal is expected.

define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %lhs = load <8 x i8>, ptr %B
  %rhs = load <8 x i8>, ptr %C
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %sum = add <8 x i16> %acc, %prod
  ret <8 x i16> %sum
}

define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %lhs = load <4 x i16>, ptr %B
  %rhs = load <4 x i16>, ptr %C
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %sum = add <4 x i32> %acc, %prod
  ret <4 x i32> %sum
}

define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %lhs = load <2 x i32>, ptr %B
  %rhs = load <2 x i32>, ptr %C
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %sum = add <2 x i64> %acc, %prod
  ret <2 x i64> %sum
}

; Unsigned widening multiply-accumulate: same shape with zero-extension, and
; a single umlal is expected.

define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %lhs = load <8 x i8>, ptr %B
  %rhs = load <8 x i8>, ptr %C
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %sum = add <8 x i16> %acc, %prod
  ret <8 x i16> %sum
}

define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %lhs = load <4 x i16>, ptr %B
  %rhs = load <4 x i16>, ptr %C
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %sum = add <4 x i32> %acc, %prod
  ret <4 x i32> %sum
}

define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %lhs = load <2 x i32>, ptr %B
  %rhs = load <2 x i32>, ptr %C
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %sum = add <2 x i64> %acc, %prod
  ret <2 x i64> %sum
}

; Zero-extended multiply-accumulate whose sum is masked down to the low
; source-width bits. The +neon and +sve runs expect smlal, the GlobalISel
; run expects umlal.

define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-NEON-LABEL: amlal_v8i8_v8i16:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ldr d2, [x2]
; CHECK-NEON-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amlal_v8i8_v8i16:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ldr d2, [x2]
; CHECK-SVE-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amlal_v8i8_v8i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    ldr d2, [x2]
; CHECK-GI-NEXT:    umlal v0.8h, v1.8b, v2.8b
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %lhs = load <8 x i8>, ptr %B
  %rhs = load <8 x i8>, ptr %C
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %sum = add <8 x i16> %acc, %prod
  %masked = and <8 x i16> %sum, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}

define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-NEON-LABEL: amlal_v4i16_v4i32:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ldr d2, [x2]
; CHECK-NEON-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amlal_v4i16_v4i32:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ldr d2, [x2]
; CHECK-SVE-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amlal_v4i16_v4i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    movi v3.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT:    ldr d2, [x2]
; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %lhs = load <4 x i16>, ptr %B
  %rhs = load <4 x i16>, ptr %C
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %sum = add <4 x i32> %acc, %prod
  %masked = and <4 x i32> %sum, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}

define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-NEON-LABEL: amlal_v2i32_v2i64:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ldr d2, [x2]
; CHECK-NEON-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amlal_v2i32_v2i64:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ldr d2, [x2]
; CHECK-SVE-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amlal_v2i32_v2i64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    movi v3.2d, #0x000000ffffffff
; CHECK-GI-NEXT:    ldr d2, [x2]
; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %lhs = load <2 x i32>, ptr %B
  %rhs = load <2 x i32>, ptr %C
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %sum = add <2 x i64> %acc, %prod
  %masked = and <2 x i64> %sum, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}

; Signed widening multiply-subtract: the sign-extended product is subtracted
; from the accumulator loaded from %A, and a single smlsl is expected.

define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %lhs = load <8 x i8>, ptr %B
  %rhs = load <8 x i8>, ptr %C
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %diff = sub <8 x i16> %acc, %prod
  ret <8 x i16> %diff
}

define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %lhs = load <4 x i16>, ptr %B
  %rhs = load <4 x i16>, ptr %C
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %diff = sub <4 x i32> %acc, %prod
  ret <4 x i32> %diff
}

define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %lhs = load <2 x i32>, ptr %B
  %rhs = load <2 x i32>, ptr %C
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %diff = sub <2 x i64> %acc, %prod
  ret <2 x i64> %diff
}

; Unsigned widening multiply-subtract: same shape with zero-extension, and a
; single umlsl is expected.

define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %lhs = load <8 x i8>, ptr %B
  %rhs = load <8 x i8>, ptr %C
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %diff = sub <8 x i16> %acc, %prod
  ret <8 x i16> %diff
}

define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %lhs = load <4 x i16>, ptr %B
  %rhs = load <4 x i16>, ptr %C
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %diff = sub <4 x i32> %acc, %prod
  ret <4 x i32> %diff
}

define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ldr d2, [x2]
; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %lhs = load <2 x i32>, ptr %B
  %rhs = load <2 x i32>, ptr %C
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %diff = sub <2 x i64> %acc, %prod
  ret <2 x i64> %diff
}

; Zero-extended multiply-subtract whose difference is masked down to the low
; source-width bits. The +neon and +sve runs expect smlsl, the GlobalISel
; run expects umlsl.

define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-NEON-LABEL: amlsl_v8i8_v8i16:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ldr d2, [x2]
; CHECK-NEON-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amlsl_v8i8_v8i16:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ldr d2, [x2]
; CHECK-SVE-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amlsl_v8i8_v8i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    ldr d2, [x2]
; CHECK-GI-NEXT:    umlsl v0.8h, v1.8b, v2.8b
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %lhs = load <8 x i8>, ptr %B
  %rhs = load <8 x i8>, ptr %C
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %prod = mul <8 x i16> %lhs.ext, %rhs.ext
  %diff = sub <8 x i16> %acc, %prod
  %masked = and <8 x i16> %diff, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}

define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-NEON-LABEL: amlsl_v4i16_v4i32:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ldr d2, [x2]
; CHECK-NEON-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amlsl_v4i16_v4i32:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ldr d2, [x2]
; CHECK-SVE-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amlsl_v4i16_v4i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    movi v3.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT:    ldr d2, [x2]
; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %lhs = load <4 x i16>, ptr %B
  %rhs = load <4 x i16>, ptr %C
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %prod = mul <4 x i32> %lhs.ext, %rhs.ext
  %diff = sub <4 x i32> %acc, %prod
  %masked = and <4 x i32> %diff, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}

define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-NEON-LABEL: amlsl_v2i32_v2i64:
; CHECK-NEON:       // %bb.0:
; CHECK-NEON-NEXT:    ldr q0, [x0]
; CHECK-NEON-NEXT:    ldr d1, [x1]
; CHECK-NEON-NEXT:    ldr d2, [x2]
; CHECK-NEON-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEON-NEXT:    ret
;
; CHECK-SVE-LABEL: amlsl_v2i32_v2i64:
; CHECK-SVE:       // %bb.0:
; CHECK-SVE-NEXT:    ldr q0, [x0]
; CHECK-SVE-NEXT:    ldr d1, [x1]
; CHECK-SVE-NEXT:    ldr d2, [x2]
; CHECK-SVE-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SVE-NEXT:    ret
;
; CHECK-GI-LABEL: amlsl_v2i32_v2i64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr d1, [x1]
; CHECK-GI-NEXT:    movi v3.2d, #0x000000ffffffff
; CHECK-GI-NEXT:    ldr d2, [x2]
; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
; CHECK-GI-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %lhs = load <2 x i32>, ptr %B
  %rhs = load <2 x i32>, ptr %C
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %prod = mul <2 x i64> %lhs.ext, %rhs.ext
  %diff = sub <2 x i64> %acc, %prod
  %masked = and <2 x i64> %diff, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}

; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
937define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { 938; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16: 939; CHECK-NEON: // %bb.0: 940; CHECK-NEON-NEXT: movi v1.8b, #244 941; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b 942; CHECK-NEON-NEXT: ret 943; 944; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16: 945; CHECK-SVE: // %bb.0: 946; CHECK-SVE-NEXT: movi v1.8b, #244 947; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b 948; CHECK-SVE-NEXT: ret 949; 950; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16: 951; CHECK-GI: // %bb.0: 952; CHECK-GI-NEXT: mvni v1.8h, #11 953; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 954; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 955; CHECK-GI-NEXT: ret 956 %tmp3 = sext <8 x i8> %arg to <8 x i16> 957 %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12> 958 ret <8 x i16> %tmp4 959} 960 961define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind { 962; Do not use SMULL if the BUILD_VECTOR element values are too big. 963; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16: 964; CHECK-NEON: // %bb.0: 965; CHECK-NEON-NEXT: mov w8, #64537 // =0xfc19 966; CHECK-NEON-NEXT: sshll v0.8h, v0.8b, #0 967; CHECK-NEON-NEXT: dup v1.8h, w8 968; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h 969; CHECK-NEON-NEXT: ret 970; 971; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16: 972; CHECK-SVE: // %bb.0: 973; CHECK-SVE-NEXT: mov w8, #64537 // =0xfc19 974; CHECK-SVE-NEXT: sshll v0.8h, v0.8b, #0 975; CHECK-SVE-NEXT: dup v1.8h, w8 976; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h 977; CHECK-SVE-NEXT: ret 978; 979; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16: 980; CHECK-GI: // %bb.0: 981; CHECK-GI-NEXT: adrp x8, .LCPI34_0 982; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 983; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_0] 984; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 985; CHECK-GI-NEXT: ret 986 %tmp3 = sext <8 x i8> %arg to <8 x i16> 987 %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999> 988 ret <8 x 
i16> %tmp4 989} 990 991define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { 992; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32: 993; CHECK-NEON: // %bb.0: 994; CHECK-NEON-NEXT: mvni v1.4h, #11 995; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h 996; CHECK-NEON-NEXT: ret 997; 998; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32: 999; CHECK-SVE: // %bb.0: 1000; CHECK-SVE-NEXT: mvni v1.4h, #11 1001; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h 1002; CHECK-SVE-NEXT: ret 1003; 1004; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32: 1005; CHECK-GI: // %bb.0: 1006; CHECK-GI-NEXT: mvni v1.4s, #11 1007; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 1008; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s 1009; CHECK-GI-NEXT: ret 1010 %tmp3 = sext <4 x i16> %arg to <4 x i32> 1011 %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12> 1012 ret <4 x i32> %tmp4 1013} 1014 1015define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { 1016; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64: 1017; CHECK-NEON: // %bb.0: 1018; CHECK-NEON-NEXT: mov w8, #-1234 // =0xfffffb2e 1019; CHECK-NEON-NEXT: dup v1.2s, w8 1020; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s 1021; CHECK-NEON-NEXT: ret 1022; 1023; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64: 1024; CHECK-SVE: // %bb.0: 1025; CHECK-SVE-NEXT: mov w8, #-1234 // =0xfffffb2e 1026; CHECK-SVE-NEXT: dup v1.2s, w8 1027; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s 1028; CHECK-SVE-NEXT: ret 1029; 1030; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64: 1031; CHECK-GI: // %bb.0: 1032; CHECK-GI-NEXT: adrp x8, .LCPI36_0 1033; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 1034; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0] 1035; CHECK-GI-NEXT: fmov x8, d0 1036; CHECK-GI-NEXT: fmov x9, d1 1037; CHECK-GI-NEXT: mov x10, v0.d[1] 1038; CHECK-GI-NEXT: mov x11, v1.d[1] 1039; CHECK-GI-NEXT: mul x8, x8, x9 1040; CHECK-GI-NEXT: mul x9, x10, x11 1041; CHECK-GI-NEXT: mov v0.d[0], x8 1042; CHECK-GI-NEXT: mov v0.d[1], x9 1043; CHECK-GI-NEXT: ret 1044 %tmp3 = sext <2 x i32> %arg to <2 x i64> 1045 
%tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234> 1046 ret <2 x i64> %tmp4 1047} 1048 1049define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { 1050; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16: 1051; CHECK-NEON: // %bb.0: 1052; CHECK-NEON-NEXT: movi v1.8b, #12 1053; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b 1054; CHECK-NEON-NEXT: ret 1055; 1056; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16: 1057; CHECK-SVE: // %bb.0: 1058; CHECK-SVE-NEXT: movi v1.8b, #12 1059; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b 1060; CHECK-SVE-NEXT: ret 1061; 1062; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16: 1063; CHECK-GI: // %bb.0: 1064; CHECK-GI-NEXT: movi v1.8h, #12 1065; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1066; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1067; CHECK-GI-NEXT: ret 1068 %tmp3 = zext <8 x i8> %arg to <8 x i16> 1069 %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12> 1070 ret <8 x i16> %tmp4 1071} 1072 1073define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind { 1074; Do not use UMULL if the BUILD_VECTOR element values are too big. 
1075; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16: 1076; CHECK-NEON: // %bb.0: 1077; CHECK-NEON-NEXT: mov w8, #999 // =0x3e7 1078; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0 1079; CHECK-NEON-NEXT: dup v1.8h, w8 1080; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h 1081; CHECK-NEON-NEXT: ret 1082; 1083; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16: 1084; CHECK-SVE: // %bb.0: 1085; CHECK-SVE-NEXT: mov w8, #999 // =0x3e7 1086; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0 1087; CHECK-SVE-NEXT: dup v1.8h, w8 1088; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h 1089; CHECK-SVE-NEXT: ret 1090; 1091; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16: 1092; CHECK-GI: // %bb.0: 1093; CHECK-GI-NEXT: adrp x8, .LCPI38_0 1094; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1095; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_0] 1096; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1097; CHECK-GI-NEXT: ret 1098 %tmp3 = zext <8 x i8> %arg to <8 x i16> 1099 %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999> 1100 ret <8 x i16> %tmp4 1101} 1102 1103define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { 1104; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32: 1105; CHECK-NEON: // %bb.0: 1106; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 1107; CHECK-NEON-NEXT: dup v1.4h, w8 1108; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h 1109; CHECK-NEON-NEXT: ret 1110; 1111; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32: 1112; CHECK-SVE: // %bb.0: 1113; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 1114; CHECK-SVE-NEXT: dup v1.4h, w8 1115; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h 1116; CHECK-SVE-NEXT: ret 1117; 1118; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32: 1119; CHECK-GI: // %bb.0: 1120; CHECK-GI-NEXT: adrp x8, .LCPI39_0 1121; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 1122; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0] 1123; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s 1124; CHECK-GI-NEXT: ret 1125 %tmp3 = zext <4 x i16> %arg to <4 x i32> 1126 %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234> 1127 
ret <4 x i32> %tmp4 1128} 1129 1130define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { 1131; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64: 1132; CHECK-NEON: // %bb.0: 1133; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 1134; CHECK-NEON-NEXT: dup v1.2s, w8 1135; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s 1136; CHECK-NEON-NEXT: ret 1137; 1138; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64: 1139; CHECK-SVE: // %bb.0: 1140; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 1141; CHECK-SVE-NEXT: dup v1.2s, w8 1142; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s 1143; CHECK-SVE-NEXT: ret 1144; 1145; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64: 1146; CHECK-GI: // %bb.0: 1147; CHECK-GI-NEXT: adrp x8, .LCPI40_0 1148; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 1149; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0] 1150; CHECK-GI-NEXT: fmov x8, d0 1151; CHECK-GI-NEXT: fmov x9, d1 1152; CHECK-GI-NEXT: mov x10, v0.d[1] 1153; CHECK-GI-NEXT: mov x11, v1.d[1] 1154; CHECK-GI-NEXT: mul x8, x8, x9 1155; CHECK-GI-NEXT: mul x9, x10, x11 1156; CHECK-GI-NEXT: mov v0.d[0], x8 1157; CHECK-GI-NEXT: mov v0.d[1], x9 1158; CHECK-GI-NEXT: ret 1159 %tmp3 = zext <2 x i32> %arg to <2 x i64> 1160 %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234> 1161 ret <2 x i64> %tmp4 1162} 1163 1164define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind { 1165; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16: 1166; CHECK-NEON: // %bb.0: 1167; CHECK-NEON-NEXT: movi v1.8b, #12 1168; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b 1169; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8 1170; CHECK-NEON-NEXT: ret 1171; 1172; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16: 1173; CHECK-SVE: // %bb.0: 1174; CHECK-SVE-NEXT: movi v1.8b, #12 1175; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b 1176; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8 1177; CHECK-SVE-NEXT: ret 1178; 1179; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16: 1180; CHECK-GI: // %bb.0: 1181; CHECK-GI-NEXT: movi v1.8h, #12 1182; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1183; CHECK-GI-NEXT: movi v2.2d, 
#0xff00ff00ff00ff 1184; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1185; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b 1186; CHECK-GI-NEXT: ret 1187 %tmp3 = zext <8 x i8> %arg to <8 x i16> 1188 %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12> 1189 %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 1190 ret <8 x i16> %and 1191} 1192 1193define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind { 1194; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32: 1195; CHECK-NEON: // %bb.0: 1196; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 1197; CHECK-NEON-NEXT: dup v1.4h, w8 1198; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h 1199; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff 1200; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b 1201; CHECK-NEON-NEXT: ret 1202; 1203; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32: 1204; CHECK-SVE: // %bb.0: 1205; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 1206; CHECK-SVE-NEXT: dup v1.4h, w8 1207; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h 1208; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff 1209; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b 1210; CHECK-SVE-NEXT: ret 1211; 1212; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32: 1213; CHECK-GI: // %bb.0: 1214; CHECK-GI-NEXT: adrp x8, .LCPI42_0 1215; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 1216; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff 1217; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_0] 1218; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s 1219; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b 1220; CHECK-GI-NEXT: ret 1221 %tmp3 = zext <4 x i16> %arg to <4 x i32> 1222 %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234> 1223 %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535> 1224 ret <4 x i32> %and 1225} 1226 1227define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind { 1228; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64: 1229; CHECK-NEON: // %bb.0: 1230; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2 1231; 
CHECK-NEON-NEXT: dup v1.2s, w8 1232; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s 1233; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff 1234; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b 1235; CHECK-NEON-NEXT: ret 1236; 1237; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64: 1238; CHECK-SVE: // %bb.0: 1239; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2 1240; CHECK-SVE-NEXT: dup v1.2s, w8 1241; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s 1242; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff 1243; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b 1244; CHECK-SVE-NEXT: ret 1245; 1246; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64: 1247; CHECK-GI: // %bb.0: 1248; CHECK-GI-NEXT: adrp x8, .LCPI43_0 1249; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 1250; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0] 1251; CHECK-GI-NEXT: fmov x8, d0 1252; CHECK-GI-NEXT: fmov x9, d1 1253; CHECK-GI-NEXT: mov x10, v0.d[1] 1254; CHECK-GI-NEXT: mov x11, v1.d[1] 1255; CHECK-GI-NEXT: movi v1.2d, #0x000000ffffffff 1256; CHECK-GI-NEXT: mul x8, x8, x9 1257; CHECK-GI-NEXT: mul x9, x10, x11 1258; CHECK-GI-NEXT: mov v0.d[0], x8 1259; CHECK-GI-NEXT: mov v0.d[1], x9 1260; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b 1261; CHECK-GI-NEXT: ret 1262 %tmp3 = zext <2 x i32> %arg to <2 x i64> 1263 %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234> 1264 %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295> 1265 ret <2 x i64> %and 1266} 1267 1268define i16 @smullWithInconsistentExtensions(<8 x i8> %x, <8 x i8> %y) { 1269; If one operand has a zero-extend and the other a sign-extend, smull 1270; cannot be used. 
1271; CHECK-LABEL: smullWithInconsistentExtensions: 1272; CHECK: // %bb.0: 1273; CHECK-NEXT: sshll v0.8h, v0.8b, #0 1274; CHECK-NEXT: ushll v1.8h, v1.8b, #0 1275; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h 1276; CHECK-NEXT: umov w0, v0.h[0] 1277; CHECK-NEXT: ret 1278 %s = sext <8 x i8> %x to <8 x i16> 1279 %z = zext <8 x i8> %y to <8 x i16> 1280 %m = mul <8 x i16> %s, %z 1281 %r = extractelement <8 x i16> %m, i32 0 1282 ret i16 %r 1283} 1284 1285define <8 x i16> @smull_extended_vector_operand(<8 x i16> %v) { 1286; CHECK-LABEL: smull_extended_vector_operand: 1287; CHECK: // %bb.0: // %entry 1288; CHECK-NEXT: movi v1.4s, #139, lsl #8 1289; CHECK-NEXT: sshll v2.4s, v0.4h, #0 1290; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 1291; CHECK-NEXT: mul v2.4s, v2.4s, v1.4s 1292; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s 1293; CHECK-NEXT: shrn v0.4h, v2.4s, #1 1294; CHECK-NEXT: shrn2 v0.8h, v1.4s, #1 1295; CHECK-NEXT: ret 1296entry: 1297%0 = sext <8 x i16> %v to <8 x i32> 1298%1 = mul <8 x i32> %0, <i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584, i32 35584> 1299%2 = lshr <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1300%3 = trunc <8 x i32> %2 to <8 x i16> 1301ret <8 x i16> %3 1302 1303} 1304 1305define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind { 1306; CHECK-NEON-LABEL: distribute: 1307; CHECK-NEON: // %bb.0: // %entry 1308; CHECK-NEON-NEXT: ldr q0, [x1] 1309; CHECK-NEON-NEXT: dup v1.8b, w2 1310; CHECK-NEON-NEXT: mov d2, v0.d[1] 1311; CHECK-NEON-NEXT: umull v2.8h, v2.8b, v1.8b 1312; CHECK-NEON-NEXT: umlal v2.8h, v0.8b, v1.8b 1313; CHECK-NEON-NEXT: str q2, [x0] 1314; CHECK-NEON-NEXT: ret 1315; 1316; CHECK-SVE-LABEL: distribute: 1317; CHECK-SVE: // %bb.0: // %entry 1318; CHECK-SVE-NEXT: ldr q0, [x1] 1319; CHECK-SVE-NEXT: dup v1.8b, w2 1320; CHECK-SVE-NEXT: mov d2, v0.d[1] 1321; CHECK-SVE-NEXT: umull v2.8h, v2.8b, v1.8b 1322; CHECK-SVE-NEXT: umlal v2.8h, v0.8b, v1.8b 1323; CHECK-SVE-NEXT: str q2, [x0] 1324; CHECK-SVE-NEXT: ret 1325; 
1326; CHECK-GI-LABEL: distribute: 1327; CHECK-GI: // %bb.0: // %entry 1328; CHECK-GI-NEXT: ldr q0, [x1] 1329; CHECK-GI-NEXT: dup v1.8b, w2 1330; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 1331; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 1332; CHECK-GI-NEXT: uaddw2 v0.8h, v2.8h, v0.16b 1333; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1334; CHECK-GI-NEXT: str q0, [x0] 1335; CHECK-GI-NEXT: ret 1336entry: 1337 %0 = trunc i32 %mul to i8 1338 %1 = insertelement <8 x i8> undef, i8 %0, i32 0 1339 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer 1340 %3 = load <16 x i8>, ptr %src, align 1 1341 %4 = bitcast <16 x i8> %3 to <2 x double> 1342 %5 = extractelement <2 x double> %4, i32 1 1343 %6 = bitcast double %5 to <8 x i8> 1344 %7 = zext <8 x i8> %6 to <8 x i16> 1345 %8 = zext <8 x i8> %2 to <8 x i16> 1346 %9 = extractelement <2 x double> %4, i32 0 1347 %10 = bitcast double %9 to <8 x i8> 1348 %11 = zext <8 x i8> %10 to <8 x i16> 1349 %12 = add <8 x i16> %7, %11 1350 %13 = mul <8 x i16> %12, %8 1351 store <8 x i16> %13, ptr %dst, align 2 1352 ret void 1353} 1354 1355define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { 1356; CHECK-NEON-LABEL: umull2_i8: 1357; CHECK-NEON: // %bb.0: 1358; CHECK-NEON-NEXT: umull2 v2.8h, v0.16b, v1.16b 1359; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b 1360; CHECK-NEON-NEXT: mov v1.16b, v2.16b 1361; CHECK-NEON-NEXT: ret 1362; 1363; CHECK-SVE-LABEL: umull2_i8: 1364; CHECK-SVE: // %bb.0: 1365; CHECK-SVE-NEXT: umull2 v2.8h, v0.16b, v1.16b 1366; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b 1367; CHECK-SVE-NEXT: mov v1.16b, v2.16b 1368; CHECK-SVE-NEXT: ret 1369; 1370; CHECK-GI-LABEL: umull2_i8: 1371; CHECK-GI: // %bb.0: 1372; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b 1373; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b 1374; CHECK-GI-NEXT: mov v0.16b, v2.16b 1375; CHECK-GI-NEXT: ret 1376 %arg1_ext = zext <16 x i8> %arg1 to <16 x i16> 1377 %arg2_ext = zext <16 x i8> %arg2 to <16 x i16> 1378 %mul = mul <16 x i16> %arg1_ext, %arg2_ext 1379 
ret <16 x i16> %mul 1380} 1381 1382define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { 1383; CHECK-NEON-LABEL: smull2_i8: 1384; CHECK-NEON: // %bb.0: 1385; CHECK-NEON-NEXT: smull2 v2.8h, v0.16b, v1.16b 1386; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b 1387; CHECK-NEON-NEXT: mov v1.16b, v2.16b 1388; CHECK-NEON-NEXT: ret 1389; 1390; CHECK-SVE-LABEL: smull2_i8: 1391; CHECK-SVE: // %bb.0: 1392; CHECK-SVE-NEXT: smull2 v2.8h, v0.16b, v1.16b 1393; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b 1394; CHECK-SVE-NEXT: mov v1.16b, v2.16b 1395; CHECK-SVE-NEXT: ret 1396; 1397; CHECK-GI-LABEL: smull2_i8: 1398; CHECK-GI: // %bb.0: 1399; CHECK-GI-NEXT: smull v2.8h, v0.8b, v1.8b 1400; CHECK-GI-NEXT: smull2 v1.8h, v0.16b, v1.16b 1401; CHECK-GI-NEXT: mov v0.16b, v2.16b 1402; CHECK-GI-NEXT: ret 1403 %arg1_ext = sext <16 x i8> %arg1 to <16 x i16> 1404 %arg2_ext = sext <16 x i8> %arg2 to <16 x i16> 1405 %mul = mul <16 x i16> %arg1_ext, %arg2_ext 1406 ret <16 x i16> %mul 1407} 1408 1409define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { 1410; CHECK-NEON-LABEL: umull2_i16: 1411; CHECK-NEON: // %bb.0: 1412; CHECK-NEON-NEXT: umull2 v2.4s, v0.8h, v1.8h 1413; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h 1414; CHECK-NEON-NEXT: mov v1.16b, v2.16b 1415; CHECK-NEON-NEXT: ret 1416; 1417; CHECK-SVE-LABEL: umull2_i16: 1418; CHECK-SVE: // %bb.0: 1419; CHECK-SVE-NEXT: umull2 v2.4s, v0.8h, v1.8h 1420; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h 1421; CHECK-SVE-NEXT: mov v1.16b, v2.16b 1422; CHECK-SVE-NEXT: ret 1423; 1424; CHECK-GI-LABEL: umull2_i16: 1425; CHECK-GI: // %bb.0: 1426; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h 1427; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h 1428; CHECK-GI-NEXT: mov v0.16b, v2.16b 1429; CHECK-GI-NEXT: ret 1430 %arg1_ext = zext <8 x i16> %arg1 to <8 x i32> 1431 %arg2_ext = zext <8 x i16> %arg2 to <8 x i32> 1432 %mul = mul <8 x i32> %arg1_ext, %arg2_ext 1433 ret <8 x i32> %mul 1434} 1435 1436define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { 1437; 
CHECK-NEON-LABEL: smull2_i16: 1438; CHECK-NEON: // %bb.0: 1439; CHECK-NEON-NEXT: smull2 v2.4s, v0.8h, v1.8h 1440; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h 1441; CHECK-NEON-NEXT: mov v1.16b, v2.16b 1442; CHECK-NEON-NEXT: ret 1443; 1444; CHECK-SVE-LABEL: smull2_i16: 1445; CHECK-SVE: // %bb.0: 1446; CHECK-SVE-NEXT: smull2 v2.4s, v0.8h, v1.8h 1447; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h 1448; CHECK-SVE-NEXT: mov v1.16b, v2.16b 1449; CHECK-SVE-NEXT: ret 1450; 1451; CHECK-GI-LABEL: smull2_i16: 1452; CHECK-GI: // %bb.0: 1453; CHECK-GI-NEXT: smull v2.4s, v0.4h, v1.4h 1454; CHECK-GI-NEXT: smull2 v1.4s, v0.8h, v1.8h 1455; CHECK-GI-NEXT: mov v0.16b, v2.16b 1456; CHECK-GI-NEXT: ret 1457 %arg1_ext = sext <8 x i16> %arg1 to <8 x i32> 1458 %arg2_ext = sext <8 x i16> %arg2 to <8 x i32> 1459 %mul = mul <8 x i32> %arg1_ext, %arg2_ext 1460 ret <8 x i32> %mul 1461} 1462 1463define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { 1464; CHECK-NEON-LABEL: umull2_i32: 1465; CHECK-NEON: // %bb.0: 1466; CHECK-NEON-NEXT: umull2 v2.2d, v0.4s, v1.4s 1467; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s 1468; CHECK-NEON-NEXT: mov v1.16b, v2.16b 1469; CHECK-NEON-NEXT: ret 1470; 1471; CHECK-SVE-LABEL: umull2_i32: 1472; CHECK-SVE: // %bb.0: 1473; CHECK-SVE-NEXT: umull2 v2.2d, v0.4s, v1.4s 1474; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s 1475; CHECK-SVE-NEXT: mov v1.16b, v2.16b 1476; CHECK-SVE-NEXT: ret 1477; 1478; CHECK-GI-LABEL: umull2_i32: 1479; CHECK-GI: // %bb.0: 1480; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s 1481; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s 1482; CHECK-GI-NEXT: mov v0.16b, v2.16b 1483; CHECK-GI-NEXT: ret 1484 %arg1_ext = zext <4 x i32> %arg1 to <4 x i64> 1485 %arg2_ext = zext <4 x i32> %arg2 to <4 x i64> 1486 %mul = mul <4 x i64> %arg1_ext, %arg2_ext 1487 ret <4 x i64> %mul 1488} 1489 1490define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { 1491; CHECK-NEON-LABEL: smull2_i32: 1492; CHECK-NEON: // %bb.0: 1493; CHECK-NEON-NEXT: smull2 v2.2d, v0.4s, v1.4s 1494; 
CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s 1495; CHECK-NEON-NEXT: mov v1.16b, v2.16b 1496; CHECK-NEON-NEXT: ret 1497; 1498; CHECK-SVE-LABEL: smull2_i32: 1499; CHECK-SVE: // %bb.0: 1500; CHECK-SVE-NEXT: smull2 v2.2d, v0.4s, v1.4s 1501; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s 1502; CHECK-SVE-NEXT: mov v1.16b, v2.16b 1503; CHECK-SVE-NEXT: ret 1504; 1505; CHECK-GI-LABEL: smull2_i32: 1506; CHECK-GI: // %bb.0: 1507; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s 1508; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s 1509; CHECK-GI-NEXT: mov v0.16b, v2.16b 1510; CHECK-GI-NEXT: ret 1511 %arg1_ext = sext <4 x i32> %arg1 to <4 x i64> 1512 %arg2_ext = sext <4 x i32> %arg2 to <4 x i64> 1513 %mul = mul <4 x i64> %arg1_ext, %arg2_ext 1514 ret <4 x i64> %mul 1515} 1516 1517define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) { 1518; CHECK-NEON-LABEL: amull2_i8: 1519; CHECK-NEON: // %bb.0: 1520; CHECK-NEON-NEXT: smull v2.8h, v0.8b, v1.8b 1521; CHECK-NEON-NEXT: smull2 v1.8h, v0.16b, v1.16b 1522; CHECK-NEON-NEXT: bic v2.8h, #255, lsl #8 1523; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 1524; CHECK-NEON-NEXT: mov v0.16b, v2.16b 1525; CHECK-NEON-NEXT: ret 1526; 1527; CHECK-SVE-LABEL: amull2_i8: 1528; CHECK-SVE: // %bb.0: 1529; CHECK-SVE-NEXT: smull v2.8h, v0.8b, v1.8b 1530; CHECK-SVE-NEXT: smull2 v1.8h, v0.16b, v1.16b 1531; CHECK-SVE-NEXT: bic v2.8h, #255, lsl #8 1532; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 1533; CHECK-SVE-NEXT: mov v0.16b, v2.16b 1534; CHECK-SVE-NEXT: ret 1535; 1536; CHECK-GI-LABEL: amull2_i8: 1537; CHECK-GI: // %bb.0: 1538; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff 1539; CHECK-GI-NEXT: umull v3.8h, v0.8b, v1.8b 1540; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b 1541; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b 1542; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1543; CHECK-GI-NEXT: ret 1544 %arg1_ext = zext <16 x i8> %arg1 to <16 x i16> 1545 %arg2_ext = zext <16 x i8> %arg2 to <16 x i16> 1546 %mul = mul <16 x i16> %arg1_ext, %arg2_ext 1547 %and = and <16 x i16> %mul, <i16 
255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 1548 ret <16 x i16> %and 1549} 1550 1551define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) { 1552; CHECK-NEON-LABEL: amull2_i16: 1553; CHECK-NEON: // %bb.0: 1554; CHECK-NEON-NEXT: movi v2.2d, #0x00ffff0000ffff 1555; CHECK-NEON-NEXT: smull v3.4s, v0.4h, v1.4h 1556; CHECK-NEON-NEXT: smull2 v0.4s, v0.8h, v1.8h 1557; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b 1558; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b 1559; CHECK-NEON-NEXT: ret 1560; 1561; CHECK-SVE-LABEL: amull2_i16: 1562; CHECK-SVE: // %bb.0: 1563; CHECK-SVE-NEXT: movi v2.2d, #0x00ffff0000ffff 1564; CHECK-SVE-NEXT: smull v3.4s, v0.4h, v1.4h 1565; CHECK-SVE-NEXT: smull2 v0.4s, v0.8h, v1.8h 1566; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b 1567; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b 1568; CHECK-SVE-NEXT: ret 1569; 1570; CHECK-GI-LABEL: amull2_i16: 1571; CHECK-GI: // %bb.0: 1572; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff 1573; CHECK-GI-NEXT: umull v3.4s, v0.4h, v1.4h 1574; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h 1575; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b 1576; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1577; CHECK-GI-NEXT: ret 1578 %arg1_ext = zext <8 x i16> %arg1 to <8 x i32> 1579 %arg2_ext = zext <8 x i16> %arg2 to <8 x i32> 1580 %mul = mul <8 x i32> %arg1_ext, %arg2_ext 1581 %and = and <8 x i32> %mul, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535> 1582 ret <8 x i32> %and 1583} 1584 1585define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) { 1586; CHECK-NEON-LABEL: amull2_i32: 1587; CHECK-NEON: // %bb.0: 1588; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff 1589; CHECK-NEON-NEXT: smull v3.2d, v0.2s, v1.2s 1590; CHECK-NEON-NEXT: smull2 v0.2d, v0.4s, v1.4s 1591; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b 1592; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b 1593; CHECK-NEON-NEXT: ret 1594; 1595; 
CHECK-SVE-LABEL: amull2_i32: 1596; CHECK-SVE: // %bb.0: 1597; CHECK-SVE-NEXT: movi v2.2d, #0x000000ffffffff 1598; CHECK-SVE-NEXT: smull v3.2d, v0.2s, v1.2s 1599; CHECK-SVE-NEXT: smull2 v0.2d, v0.4s, v1.4s 1600; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b 1601; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b 1602; CHECK-SVE-NEXT: ret 1603; 1604; CHECK-GI-LABEL: amull2_i32: 1605; CHECK-GI: // %bb.0: 1606; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff 1607; CHECK-GI-NEXT: umull v3.2d, v0.2s, v1.2s 1608; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s 1609; CHECK-GI-NEXT: and v0.16b, v3.16b, v2.16b 1610; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1611; CHECK-GI-NEXT: ret 1612 %arg1_ext = zext <4 x i32> %arg1 to <4 x i64> 1613 %arg2_ext = zext <4 x i32> %arg2 to <4 x i64> 1614 %mul = mul <4 x i64> %arg1_ext, %arg2_ext 1615 %and = and <4 x i64> %mul, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1616 ret <4 x i64> %and 1617} 1618 1619 1620define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) { 1621; CHECK-NEON-LABEL: umull_and_v8i16: 1622; CHECK-NEON: // %bb.0: // %entry 1623; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 1624; CHECK-NEON-NEXT: xtn v1.8b, v1.8h 1625; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b 1626; CHECK-NEON-NEXT: ret 1627; 1628; CHECK-SVE-LABEL: umull_and_v8i16: 1629; CHECK-SVE: // %bb.0: // %entry 1630; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 1631; CHECK-SVE-NEXT: xtn v1.8b, v1.8h 1632; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b 1633; CHECK-SVE-NEXT: ret 1634; 1635; CHECK-GI-LABEL: umull_and_v8i16: 1636; CHECK-GI: // %bb.0: // %entry 1637; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff 1638; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1639; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1640; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1641; CHECK-GI-NEXT: ret 1642entry: 1643 %in1 = zext <8 x i8> %src1 to <8 x i16> 1644 %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 1645 %out = mul nsw <8 x i16> %in1, 
%in2 1646 ret <8 x i16> %out 1647} 1648 1649define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) { 1650; CHECK-NEON-LABEL: umull_and_v8i16_c: 1651; CHECK-NEON: // %bb.0: // %entry 1652; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 1653; CHECK-NEON-NEXT: xtn v1.8b, v1.8h 1654; CHECK-NEON-NEXT: umull v0.8h, v1.8b, v0.8b 1655; CHECK-NEON-NEXT: ret 1656; 1657; CHECK-SVE-LABEL: umull_and_v8i16_c: 1658; CHECK-SVE: // %bb.0: // %entry 1659; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 1660; CHECK-SVE-NEXT: xtn v1.8b, v1.8h 1661; CHECK-SVE-NEXT: umull v0.8h, v1.8b, v0.8b 1662; CHECK-SVE-NEXT: ret 1663; 1664; CHECK-GI-LABEL: umull_and_v8i16_c: 1665; CHECK-GI: // %bb.0: // %entry 1666; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff 1667; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1668; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1669; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h 1670; CHECK-GI-NEXT: ret 1671entry: 1672 %in1 = zext <8 x i8> %src1 to <8 x i16> 1673 %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 1674 %out = mul nsw <8 x i16> %in2, %in1 1675 ret <8 x i16> %out 1676} 1677 1678define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) { 1679; CHECK-LABEL: umull_and256_v8i16: 1680; CHECK: // %bb.0: // %entry 1681; CHECK-NEXT: movi v2.8h, #1, lsl #8 1682; CHECK-NEXT: ushll v0.8h, v0.8b, #0 1683; CHECK-NEXT: and v1.16b, v1.16b, v2.16b 1684; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h 1685; CHECK-NEXT: ret 1686entry: 1687 %in1 = zext <8 x i8> %src1 to <8 x i16> 1688 %in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256> 1689 %out = mul nsw <8 x i16> %in1, %in2 1690 ret <8 x i16> %out 1691} 1692 1693define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) { 1694; CHECK-NEON-LABEL: umull_andconst_v8i16: 1695; CHECK-NEON: // %bb.0: // %entry 1696; CHECK-NEON-NEXT: movi v1.2d, #0xffffffffffffffff 1697; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b 1698; 
CHECK-NEON-NEXT: ret 1699; 1700; CHECK-SVE-LABEL: umull_andconst_v8i16: 1701; CHECK-SVE: // %bb.0: // %entry 1702; CHECK-SVE-NEXT: movi v1.2d, #0xffffffffffffffff 1703; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b 1704; CHECK-SVE-NEXT: ret 1705; 1706; CHECK-GI-LABEL: umull_andconst_v8i16: 1707; CHECK-GI: // %bb.0: // %entry 1708; CHECK-GI-NEXT: movi v1.2d, #0xff00ff00ff00ff 1709; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1710; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1711; CHECK-GI-NEXT: ret 1712entry: 1713 %in1 = zext <8 x i8> %src1 to <8 x i16> 1714 %out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 1715 ret <8 x i16> %out 1716} 1717 1718define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) { 1719; CHECK-NEON-LABEL: umull_smaller_v8i16: 1720; CHECK-NEON: // %bb.0: // %entry 1721; CHECK-NEON-NEXT: movi v2.8b, #15 1722; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8 1723; CHECK-NEON-NEXT: xtn v1.8b, v1.8h 1724; CHECK-NEON-NEXT: and v0.8b, v0.8b, v2.8b 1725; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b 1726; CHECK-NEON-NEXT: ret 1727; 1728; CHECK-SVE-LABEL: umull_smaller_v8i16: 1729; CHECK-SVE: // %bb.0: // %entry 1730; CHECK-SVE-NEXT: movi v2.8b, #15 1731; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8 1732; CHECK-SVE-NEXT: xtn v1.8b, v1.8h 1733; CHECK-SVE-NEXT: and v0.8b, v0.8b, v2.8b 1734; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b 1735; CHECK-SVE-NEXT: ret 1736; 1737; CHECK-GI-LABEL: umull_smaller_v8i16: 1738; CHECK-GI: // %bb.0: // %entry 1739; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff 1740; CHECK-GI-NEXT: movi v3.8h, #15 1741; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 1742; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b 1743; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1744; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h 1745; CHECK-GI-NEXT: ret 1746entry: 1747 %in1 = zext <8 x i4> %src1 to <8 x i16> 1748 %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 1749 %out = mul nsw <8 x 
i16> %in1, %in2 1750 ret <8 x i16> %out 1751} 1752 1753define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) { 1754; CHECK-NEON-LABEL: umull_and_v4i32: 1755; CHECK-NEON: // %bb.0: // %entry 1756; CHECK-NEON-NEXT: movi v2.2d, #0x0000ff000000ff 1757; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b 1758; CHECK-NEON-NEXT: xtn v1.4h, v1.4s 1759; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h 1760; CHECK-NEON-NEXT: ret 1761; 1762; CHECK-SVE-LABEL: umull_and_v4i32: 1763; CHECK-SVE: // %bb.0: // %entry 1764; CHECK-SVE-NEXT: movi v2.2d, #0x0000ff000000ff 1765; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b 1766; CHECK-SVE-NEXT: xtn v1.4h, v1.4s 1767; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h 1768; CHECK-SVE-NEXT: ret 1769; 1770; CHECK-GI-LABEL: umull_and_v4i32: 1771; CHECK-GI: // %bb.0: // %entry 1772; CHECK-GI-NEXT: movi v2.2d, #0x0000ff000000ff 1773; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 1774; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1775; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s 1776; CHECK-GI-NEXT: ret 1777entry: 1778 %in1 = zext <4 x i16> %src1 to <4 x i32> 1779 %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255> 1780 %out = mul nsw <4 x i32> %in1, %in2 1781 ret <4 x i32> %out 1782} 1783 1784define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) { 1785; CHECK-NEON-LABEL: umull_and_v8i32: 1786; CHECK-NEON: // %bb.0: // %entry 1787; CHECK-NEON-NEXT: movi v3.2d, #0x0000ff000000ff 1788; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b 1789; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b 1790; CHECK-NEON-NEXT: uzp1 v2.8h, v1.8h, v2.8h 1791; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h 1792; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h 1793; CHECK-NEON-NEXT: ret 1794; 1795; CHECK-SVE-LABEL: umull_and_v8i32: 1796; CHECK-SVE: // %bb.0: // %entry 1797; CHECK-SVE-NEXT: movi v3.2d, #0x0000ff000000ff 1798; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b 1799; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b 1800; CHECK-SVE-NEXT: uzp1 v2.8h, v1.8h, v2.8h 1801; CHECK-SVE-NEXT: 
umull2 v1.4s, v0.8h, v2.8h 1802; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h 1803; CHECK-SVE-NEXT: ret 1804; 1805; CHECK-GI-LABEL: umull_and_v8i32: 1806; CHECK-GI: // %bb.0: // %entry 1807; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff 1808; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 1809; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 1810; CHECK-GI-NEXT: and v0.16b, v1.16b, v3.16b 1811; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b 1812; CHECK-GI-NEXT: mul v0.4s, v4.4s, v0.4s 1813; CHECK-GI-NEXT: mul v1.4s, v5.4s, v1.4s 1814; CHECK-GI-NEXT: ret 1815entry: 1816 %in1 = zext <8 x i16> %src1 to <8 x i32> 1817 %in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255> 1818 %out = mul nsw <8 x i32> %in1, %in2 1819 ret <8 x i32> %out 1820} 1821 1822define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) { 1823; CHECK-NEON-LABEL: umull_and_v8i32_dup: 1824; CHECK-NEON: // %bb.0: // %entry 1825; CHECK-NEON-NEXT: and w8, w0, #0xff 1826; CHECK-NEON-NEXT: dup v2.8h, w8 1827; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h 1828; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h 1829; CHECK-NEON-NEXT: ret 1830; 1831; CHECK-SVE-LABEL: umull_and_v8i32_dup: 1832; CHECK-SVE: // %bb.0: // %entry 1833; CHECK-SVE-NEXT: and w8, w0, #0xff 1834; CHECK-SVE-NEXT: dup v2.8h, w8 1835; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h 1836; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h 1837; CHECK-SVE-NEXT: ret 1838; 1839; CHECK-GI-LABEL: umull_and_v8i32_dup: 1840; CHECK-GI: // %bb.0: // %entry 1841; CHECK-GI-NEXT: and w8, w0, #0xff 1842; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 1843; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0 1844; CHECK-GI-NEXT: dup v3.4s, w8 1845; CHECK-GI-NEXT: mul v0.4s, v1.4s, v3.4s 1846; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s 1847; CHECK-GI-NEXT: ret 1848entry: 1849 %in1 = zext <8 x i16> %src1 to <8 x i32> 1850 %in2 = and i32 %src2, 255 1851 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %in2, i64 0 1852 %broadcast.splat = shufflevector <8 x i32> 
%broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer 1853 %out = mul nsw <8 x i32> %in1, %broadcast.splat 1854 ret <8 x i32> %out 1855} 1856 1857define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) { 1858; CHECK-NEON-LABEL: umull_and_v2i64: 1859; CHECK-NEON: // %bb.0: // %entry 1860; CHECK-NEON-NEXT: movi v2.2d, #0x000000000000ff 1861; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b 1862; CHECK-NEON-NEXT: xtn v1.2s, v1.2d 1863; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s 1864; CHECK-NEON-NEXT: ret 1865; 1866; CHECK-SVE-LABEL: umull_and_v2i64: 1867; CHECK-SVE: // %bb.0: // %entry 1868; CHECK-SVE-NEXT: movi v2.2d, #0x000000000000ff 1869; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b 1870; CHECK-SVE-NEXT: xtn v1.2s, v1.2d 1871; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s 1872; CHECK-SVE-NEXT: ret 1873; 1874; CHECK-GI-LABEL: umull_and_v2i64: 1875; CHECK-GI: // %bb.0: // %entry 1876; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff 1877; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 1878; CHECK-GI-NEXT: fmov x8, d0 1879; CHECK-GI-NEXT: mov x10, v0.d[1] 1880; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b 1881; CHECK-GI-NEXT: fmov x9, d1 1882; CHECK-GI-NEXT: mov x11, v1.d[1] 1883; CHECK-GI-NEXT: mul x8, x8, x9 1884; CHECK-GI-NEXT: mul x9, x10, x11 1885; CHECK-GI-NEXT: mov v0.d[0], x8 1886; CHECK-GI-NEXT: mov v0.d[1], x9 1887; CHECK-GI-NEXT: ret 1888entry: 1889 %in1 = zext <2 x i32> %src1 to <2 x i64> 1890 %in2 = and <2 x i64> %src2, <i64 255, i64 255> 1891 %out = mul nsw <2 x i64> %in1, %in2 1892 ret <2 x i64> %out 1893} 1894 1895define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) { 1896; CHECK-NEON-LABEL: umull_and_v4i64: 1897; CHECK-NEON: // %bb.0: // %entry 1898; CHECK-NEON-NEXT: movi v3.2d, #0x000000000000ff 1899; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b 1900; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b 1901; CHECK-NEON-NEXT: uzp1 v2.4s, v1.4s, v2.4s 1902; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s 1903; CHECK-NEON-NEXT: umull v0.2d, v0.2s, 
v2.2s 1904; CHECK-NEON-NEXT: ret 1905; 1906; CHECK-SVE-LABEL: umull_and_v4i64: 1907; CHECK-SVE: // %bb.0: // %entry 1908; CHECK-SVE-NEXT: movi v3.2d, #0x000000000000ff 1909; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b 1910; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b 1911; CHECK-SVE-NEXT: uzp1 v2.4s, v1.4s, v2.4s 1912; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s 1913; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s 1914; CHECK-SVE-NEXT: ret 1915; 1916; CHECK-GI-LABEL: umull_and_v4i64: 1917; CHECK-GI: // %bb.0: // %entry 1918; CHECK-GI-NEXT: movi v3.2d, #0x000000000000ff 1919; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 1920; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 1921; CHECK-GI-NEXT: fmov x8, d4 1922; CHECK-GI-NEXT: mov x10, v4.d[1] 1923; CHECK-GI-NEXT: mov x13, v0.d[1] 1924; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b 1925; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b 1926; CHECK-GI-NEXT: fmov x9, d1 1927; CHECK-GI-NEXT: fmov x12, d2 1928; CHECK-GI-NEXT: mov x11, v1.d[1] 1929; CHECK-GI-NEXT: mov x14, v2.d[1] 1930; CHECK-GI-NEXT: mul x8, x8, x9 1931; CHECK-GI-NEXT: fmov x9, d0 1932; CHECK-GI-NEXT: mul x10, x10, x11 1933; CHECK-GI-NEXT: mul x9, x9, x12 1934; CHECK-GI-NEXT: mov v0.d[0], x8 1935; CHECK-GI-NEXT: mul x11, x13, x14 1936; CHECK-GI-NEXT: mov v1.d[0], x9 1937; CHECK-GI-NEXT: mov v0.d[1], x10 1938; CHECK-GI-NEXT: mov v1.d[1], x11 1939; CHECK-GI-NEXT: ret 1940entry: 1941 %in1 = zext <4 x i32> %src1 to <4 x i64> 1942 %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255> 1943 %out = mul nsw <4 x i64> %in1, %in2 1944 ret <4 x i64> %out 1945} 1946 1947define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) { 1948; CHECK-NEON-LABEL: umull_and_v4i64_dup: 1949; CHECK-NEON: // %bb.0: // %entry 1950; CHECK-NEON-NEXT: and w8, w0, #0xff 1951; CHECK-NEON-NEXT: dup v2.4s, w8 1952; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s 1953; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s 1954; CHECK-NEON-NEXT: ret 1955; 1956; CHECK-SVE-LABEL: umull_and_v4i64_dup: 1957; CHECK-SVE: // 
%bb.0: // %entry 1958; CHECK-SVE-NEXT: and w8, w0, #0xff 1959; CHECK-SVE-NEXT: dup v2.4s, w8 1960; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s 1961; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s 1962; CHECK-SVE-NEXT: ret 1963; 1964; CHECK-GI-LABEL: umull_and_v4i64_dup: 1965; CHECK-GI: // %bb.0: // %entry 1966; CHECK-GI-NEXT: and x8, x0, #0xff 1967; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0 1968; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 1969; CHECK-GI-NEXT: dup v2.2d, x8 1970; CHECK-GI-NEXT: fmov x8, d1 1971; CHECK-GI-NEXT: fmov x12, d0 1972; CHECK-GI-NEXT: mov x10, v1.d[1] 1973; CHECK-GI-NEXT: fmov x9, d2 1974; CHECK-GI-NEXT: mov x11, v2.d[1] 1975; CHECK-GI-NEXT: mov x13, v0.d[1] 1976; CHECK-GI-NEXT: mul x8, x8, x9 1977; CHECK-GI-NEXT: mul x9, x12, x9 1978; CHECK-GI-NEXT: mul x10, x10, x11 1979; CHECK-GI-NEXT: mov v0.d[0], x8 1980; CHECK-GI-NEXT: mul x11, x13, x11 1981; CHECK-GI-NEXT: mov v1.d[0], x9 1982; CHECK-GI-NEXT: mov v0.d[1], x10 1983; CHECK-GI-NEXT: mov v1.d[1], x11 1984; CHECK-GI-NEXT: ret 1985entry: 1986 %in1 = zext <4 x i32> %src1 to <4 x i64> 1987 %in2 = and i64 %src2, 255 1988 %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %in2, i64 0 1989 %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer 1990 %out = mul nsw <4 x i64> %in1, %broadcast.splat 1991 ret <4 x i64> %out 1992} 1993 1994define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { 1995; CHECK-LABEL: pmlsl2_v8i16_uzp1: 1996; CHECK: // %bb.0: 1997; CHECK-NEXT: ldr q2, [x1, #16] 1998; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b 1999; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b 2000; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h 2001; CHECK-NEXT: str q0, [x0] 2002; CHECK-NEXT: ret 2003 %5 = getelementptr inbounds i32, ptr %3, i64 4 2004 %6 = load <8 x i16>, ptr %5, align 4 2005 %7 = trunc <8 x i16> %6 to <8 x i8> 2006 %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 
15> 2007 %9 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %8, <8 x i8> %7) 2008 %10 = sub <8 x i16> %1, %9 2009 store <8 x i16> %10, ptr %2, align 16 2010 ret void 2011} 2012 2013define void @smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { 2014; CHECK-NEON-LABEL: smlsl2_v8i16_uzp1: 2015; CHECK-NEON: // %bb.0: 2016; CHECK-NEON-NEXT: ldr q2, [x1, #16] 2017; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b 2018; CHECK-NEON-NEXT: smlsl2 v1.8h, v0.16b, v2.16b 2019; CHECK-NEON-NEXT: str q1, [x0] 2020; CHECK-NEON-NEXT: ret 2021; 2022; CHECK-SVE-LABEL: smlsl2_v8i16_uzp1: 2023; CHECK-SVE: // %bb.0: 2024; CHECK-SVE-NEXT: ldr q2, [x1, #16] 2025; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b 2026; CHECK-SVE-NEXT: smlsl2 v1.8h, v0.16b, v2.16b 2027; CHECK-SVE-NEXT: str q1, [x0] 2028; CHECK-SVE-NEXT: ret 2029; 2030; CHECK-GI-LABEL: smlsl2_v8i16_uzp1: 2031; CHECK-GI: // %bb.0: 2032; CHECK-GI-NEXT: ldr q2, [x1, #16] 2033; CHECK-GI-NEXT: mov d0, v0.d[1] 2034; CHECK-GI-NEXT: xtn v2.8b, v2.8h 2035; CHECK-GI-NEXT: smlsl v1.8h, v0.8b, v2.8b 2036; CHECK-GI-NEXT: str q1, [x0] 2037; CHECK-GI-NEXT: ret 2038 %5 = getelementptr inbounds i32, ptr %3, i64 4 2039 %6 = load <8 x i16>, ptr %5, align 4 2040 %7 = trunc <8 x i16> %6 to <8 x i8> 2041 %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2042 %9 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %8, <8 x i8> %7) 2043 %10 = sub <8 x i16> %1, %9 2044 store <8 x i16> %10, ptr %2, align 16 2045 ret void 2046} 2047 2048define void @umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) { 2049; CHECK-NEON-LABEL: umlsl2_v8i16_uzp1: 2050; CHECK-NEON: // %bb.0: 2051; CHECK-NEON-NEXT: ldr q2, [x1, #16] 2052; CHECK-NEON-NEXT: uzp1 v2.16b, v0.16b, v2.16b 2053; CHECK-NEON-NEXT: umlsl2 v1.8h, v0.16b, v2.16b 2054; CHECK-NEON-NEXT: str q1, [x0] 2055; CHECK-NEON-NEXT: ret 2056; 2057; CHECK-SVE-LABEL: umlsl2_v8i16_uzp1: 2058; CHECK-SVE: // 
%bb.0: 2059; CHECK-SVE-NEXT: ldr q2, [x1, #16] 2060; CHECK-SVE-NEXT: uzp1 v2.16b, v0.16b, v2.16b 2061; CHECK-SVE-NEXT: umlsl2 v1.8h, v0.16b, v2.16b 2062; CHECK-SVE-NEXT: str q1, [x0] 2063; CHECK-SVE-NEXT: ret 2064; 2065; CHECK-GI-LABEL: umlsl2_v8i16_uzp1: 2066; CHECK-GI: // %bb.0: 2067; CHECK-GI-NEXT: ldr q2, [x1, #16] 2068; CHECK-GI-NEXT: mov d0, v0.d[1] 2069; CHECK-GI-NEXT: xtn v2.8b, v2.8h 2070; CHECK-GI-NEXT: umlsl v1.8h, v0.8b, v2.8b 2071; CHECK-GI-NEXT: str q1, [x0] 2072; CHECK-GI-NEXT: ret 2073 %5 = getelementptr inbounds i32, ptr %3, i64 4 2074 %6 = load <8 x i16>, ptr %5, align 4 2075 %7 = trunc <8 x i16> %6 to <8 x i8> 2076 %8 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2077 %9 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %8, <8 x i8> %7) 2078 %10 = sub <8 x i16> %1, %9 2079 store <8 x i16> %10, ptr %2, align 16 2080 ret void 2081} 2082 2083define void @smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { 2084; CHECK-NEON-LABEL: smlsl2_v4i32_uzp1: 2085; CHECK-NEON: // %bb.0: 2086; CHECK-NEON-NEXT: ldr q2, [x1, #16] 2087; CHECK-NEON-NEXT: uzp1 v2.8h, v0.8h, v2.8h 2088; CHECK-NEON-NEXT: smlsl2 v1.4s, v0.8h, v2.8h 2089; CHECK-NEON-NEXT: str q1, [x0] 2090; CHECK-NEON-NEXT: ret 2091; 2092; CHECK-SVE-LABEL: smlsl2_v4i32_uzp1: 2093; CHECK-SVE: // %bb.0: 2094; CHECK-SVE-NEXT: ldr q2, [x1, #16] 2095; CHECK-SVE-NEXT: uzp1 v2.8h, v0.8h, v2.8h 2096; CHECK-SVE-NEXT: smlsl2 v1.4s, v0.8h, v2.8h 2097; CHECK-SVE-NEXT: str q1, [x0] 2098; CHECK-SVE-NEXT: ret 2099; 2100; CHECK-GI-LABEL: smlsl2_v4i32_uzp1: 2101; CHECK-GI: // %bb.0: 2102; CHECK-GI-NEXT: ldr q2, [x1, #16] 2103; CHECK-GI-NEXT: mov d0, v0.d[1] 2104; CHECK-GI-NEXT: xtn v2.4h, v2.4s 2105; CHECK-GI-NEXT: smlsl v1.4s, v0.4h, v2.4h 2106; CHECK-GI-NEXT: str q1, [x0] 2107; CHECK-GI-NEXT: ret 2108 %5 = getelementptr inbounds i32, ptr %3, i64 4 2109 %6 = load <4 x i32>, ptr %5, align 4 2110 %7 = trunc <4 x i32> %6 to 
<4 x i16> 2111 %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2112 %9 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %8, <4 x i16> %7) 2113 %10 = sub <4 x i32> %1, %9 2114 store <4 x i32> %10, ptr %2, align 16 2115 ret void 2116} 2117 2118define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) { 2119; CHECK-NEON-LABEL: umlsl2_v4i32_uzp1: 2120; CHECK-NEON: // %bb.0: 2121; CHECK-NEON-NEXT: ldr q2, [x1, #16] 2122; CHECK-NEON-NEXT: uzp1 v2.8h, v0.8h, v2.8h 2123; CHECK-NEON-NEXT: umlsl2 v1.4s, v0.8h, v2.8h 2124; CHECK-NEON-NEXT: str q1, [x0] 2125; CHECK-NEON-NEXT: ret 2126; 2127; CHECK-SVE-LABEL: umlsl2_v4i32_uzp1: 2128; CHECK-SVE: // %bb.0: 2129; CHECK-SVE-NEXT: ldr q2, [x1, #16] 2130; CHECK-SVE-NEXT: uzp1 v2.8h, v0.8h, v2.8h 2131; CHECK-SVE-NEXT: umlsl2 v1.4s, v0.8h, v2.8h 2132; CHECK-SVE-NEXT: str q1, [x0] 2133; CHECK-SVE-NEXT: ret 2134; 2135; CHECK-GI-LABEL: umlsl2_v4i32_uzp1: 2136; CHECK-GI: // %bb.0: 2137; CHECK-GI-NEXT: ldr q2, [x1, #16] 2138; CHECK-GI-NEXT: mov d0, v0.d[1] 2139; CHECK-GI-NEXT: xtn v2.4h, v2.4s 2140; CHECK-GI-NEXT: umlsl v1.4s, v0.4h, v2.4h 2141; CHECK-GI-NEXT: str q1, [x0] 2142; CHECK-GI-NEXT: ret 2143 %5 = getelementptr inbounds i32, ptr %3, i64 4 2144 %6 = load <4 x i32>, ptr %5, align 4 2145 %7 = trunc <4 x i32> %6 to <4 x i16> 2146 %8 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2147 %9 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %8, <4 x i16> %7) 2148 %10 = sub <4 x i32> %1, %9 2149 store <4 x i32> %10, ptr %2, align 16 2150 ret void 2151} 2152 2153define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { 2154; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1: 2155; CHECK: // %bb.0: // %entry 2156; CHECK-NEXT: ldp q2, q3, [x1] 2157; CHECK-NEXT: uzp1 v2.16b, v2.16b, v3.16b 2158; CHECK-NEXT: pmull v3.8h, v0.8b, v2.8b 2159; CHECK-NEXT: pmull2 v0.8h, v0.16b, v2.16b 2160; CHECK-NEXT: add 
v0.8h, v3.8h, v0.8h 2161; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h 2162; CHECK-NEXT: str q0, [x0] 2163; CHECK-NEXT: ret 2164entry: 2165 %5 = load <8 x i16>, ptr %3, align 4 2166 %6 = trunc <8 x i16> %5 to <8 x i8> 2167 %7 = getelementptr inbounds i32, ptr %3, i64 4 2168 %8 = load <8 x i16>, ptr %7, align 4 2169 %9 = trunc <8 x i16> %8 to <8 x i8> 2170 %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2171 %11 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %10, <8 x i8> %6) 2172 %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2173 %13 = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %12, <8 x i8> %9) 2174 %14 = add <8 x i16> %11, %13 2175 %15 = sub <8 x i16> %1, %14 2176 store <8 x i16> %15, ptr %2, align 16 2177 ret void 2178} 2179 2180define void @smlsl_smlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { 2181; CHECK-NEON-LABEL: smlsl_smlsl2_v8i16_uzp1: 2182; CHECK-NEON: // %bb.0: // %entry 2183; CHECK-NEON-NEXT: ldp q2, q3, [x1] 2184; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b 2185; CHECK-NEON-NEXT: smlsl v1.8h, v0.8b, v2.8b 2186; CHECK-NEON-NEXT: smlsl2 v1.8h, v0.16b, v2.16b 2187; CHECK-NEON-NEXT: str q1, [x0] 2188; CHECK-NEON-NEXT: ret 2189; 2190; CHECK-SVE-LABEL: smlsl_smlsl2_v8i16_uzp1: 2191; CHECK-SVE: // %bb.0: // %entry 2192; CHECK-SVE-NEXT: ldp q2, q3, [x1] 2193; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b 2194; CHECK-SVE-NEXT: smlsl v1.8h, v0.8b, v2.8b 2195; CHECK-SVE-NEXT: smlsl2 v1.8h, v0.16b, v2.16b 2196; CHECK-SVE-NEXT: str q1, [x0] 2197; CHECK-SVE-NEXT: ret 2198; 2199; CHECK-GI-LABEL: smlsl_smlsl2_v8i16_uzp1: 2200; CHECK-GI: // %bb.0: // %entry 2201; CHECK-GI-NEXT: ldp q4, q2, [x1] 2202; CHECK-GI-NEXT: mov d3, v0.d[1] 2203; CHECK-GI-NEXT: xtn v2.8b, v2.8h 2204; CHECK-GI-NEXT: xtn v4.8b, v4.8h 2205; CHECK-GI-NEXT: smull v2.8h, v3.8b, v2.8b 2206; CHECK-GI-NEXT: 
smlal v2.8h, v0.8b, v4.8b 2207; CHECK-GI-NEXT: sub v0.8h, v1.8h, v2.8h 2208; CHECK-GI-NEXT: str q0, [x0] 2209; CHECK-GI-NEXT: ret 2210entry: 2211 %5 = load <8 x i16>, ptr %3, align 4 2212 %6 = trunc <8 x i16> %5 to <8 x i8> 2213 %7 = getelementptr inbounds i32, ptr %3, i64 4 2214 %8 = load <8 x i16>, ptr %7, align 4 2215 %9 = trunc <8 x i16> %8 to <8 x i8> 2216 %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2217 %11 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %10, <8 x i8> %6) 2218 %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2219 %13 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %12, <8 x i8> %9) 2220 %14 = add <8 x i16> %11, %13 2221 %15 = sub <8 x i16> %1, %14 2222 store <8 x i16> %15, ptr %2, align 16 2223 ret void 2224} 2225 2226define void @umlsl_umlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) { 2227; CHECK-NEON-LABEL: umlsl_umlsl2_v8i16_uzp1: 2228; CHECK-NEON: // %bb.0: // %entry 2229; CHECK-NEON-NEXT: ldp q2, q3, [x1] 2230; CHECK-NEON-NEXT: uzp1 v2.16b, v2.16b, v3.16b 2231; CHECK-NEON-NEXT: umlsl v1.8h, v0.8b, v2.8b 2232; CHECK-NEON-NEXT: umlsl2 v1.8h, v0.16b, v2.16b 2233; CHECK-NEON-NEXT: str q1, [x0] 2234; CHECK-NEON-NEXT: ret 2235; 2236; CHECK-SVE-LABEL: umlsl_umlsl2_v8i16_uzp1: 2237; CHECK-SVE: // %bb.0: // %entry 2238; CHECK-SVE-NEXT: ldp q2, q3, [x1] 2239; CHECK-SVE-NEXT: uzp1 v2.16b, v2.16b, v3.16b 2240; CHECK-SVE-NEXT: umlsl v1.8h, v0.8b, v2.8b 2241; CHECK-SVE-NEXT: umlsl2 v1.8h, v0.16b, v2.16b 2242; CHECK-SVE-NEXT: str q1, [x0] 2243; CHECK-SVE-NEXT: ret 2244; 2245; CHECK-GI-LABEL: umlsl_umlsl2_v8i16_uzp1: 2246; CHECK-GI: // %bb.0: // %entry 2247; CHECK-GI-NEXT: ldp q4, q2, [x1] 2248; CHECK-GI-NEXT: mov d3, v0.d[1] 2249; CHECK-GI-NEXT: xtn v2.8b, v2.8h 2250; CHECK-GI-NEXT: xtn v4.8b, v4.8h 2251; CHECK-GI-NEXT: umull v2.8h, v3.8b, v2.8b 2252; 
CHECK-GI-NEXT: umlal v2.8h, v0.8b, v4.8b 2253; CHECK-GI-NEXT: sub v0.8h, v1.8h, v2.8h 2254; CHECK-GI-NEXT: str q0, [x0] 2255; CHECK-GI-NEXT: ret 2256entry: 2257 %5 = load <8 x i16>, ptr %3, align 4 2258 %6 = trunc <8 x i16> %5 to <8 x i8> 2259 %7 = getelementptr inbounds i32, ptr %3, i64 4 2260 %8 = load <8 x i16>, ptr %7, align 4 2261 %9 = trunc <8 x i16> %8 to <8 x i8> 2262 %10 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2263 %11 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %10, <8 x i8> %6) 2264 %12 = shufflevector <16 x i8> %0, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2265 %13 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %12, <8 x i8> %9) 2266 %14 = add <8 x i16> %11, %13 2267 %15 = sub <8 x i16> %1, %14 2268 store <8 x i16> %15, ptr %2, align 16 2269 ret void 2270} 2271 2272define void @smlsl_smlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) { 2273; CHECK-NEON-LABEL: smlsl_smlsl2_v4i32_uzp1: 2274; CHECK-NEON: // %bb.0: // %entry 2275; CHECK-NEON-NEXT: ldp q2, q3, [x1] 2276; CHECK-NEON-NEXT: uzp1 v2.8h, v2.8h, v3.8h 2277; CHECK-NEON-NEXT: smlsl v1.4s, v0.4h, v2.4h 2278; CHECK-NEON-NEXT: smlsl2 v1.4s, v0.8h, v2.8h 2279; CHECK-NEON-NEXT: str q1, [x0] 2280; CHECK-NEON-NEXT: ret 2281; 2282; CHECK-SVE-LABEL: smlsl_smlsl2_v4i32_uzp1: 2283; CHECK-SVE: // %bb.0: // %entry 2284; CHECK-SVE-NEXT: ldp q2, q3, [x1] 2285; CHECK-SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h 2286; CHECK-SVE-NEXT: smlsl v1.4s, v0.4h, v2.4h 2287; CHECK-SVE-NEXT: smlsl2 v1.4s, v0.8h, v2.8h 2288; CHECK-SVE-NEXT: str q1, [x0] 2289; CHECK-SVE-NEXT: ret 2290; 2291; CHECK-GI-LABEL: smlsl_smlsl2_v4i32_uzp1: 2292; CHECK-GI: // %bb.0: // %entry 2293; CHECK-GI-NEXT: ldp q4, q2, [x1] 2294; CHECK-GI-NEXT: mov d3, v0.d[1] 2295; CHECK-GI-NEXT: xtn v2.4h, v2.4s 2296; CHECK-GI-NEXT: xtn v4.4h, v4.4s 2297; CHECK-GI-NEXT: smull v2.4s, v3.4h, v2.4h 
2298; CHECK-GI-NEXT: smlal v2.4s, v0.4h, v4.4h 2299; CHECK-GI-NEXT: sub v0.4s, v1.4s, v2.4s 2300; CHECK-GI-NEXT: str q0, [x0] 2301; CHECK-GI-NEXT: ret 2302entry: 2303 %5 = load <4 x i32>, ptr %3, align 4 2304 %6 = trunc <4 x i32> %5 to <4 x i16> 2305 %7 = getelementptr inbounds i32, ptr %3, i64 4 2306 %8 = load <4 x i32>, ptr %7, align 4 2307 %9 = trunc <4 x i32> %8 to <4 x i16> 2308 %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2309 %11 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %10, <4 x i16> %6) 2310 %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2311 %13 = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %12, <4 x i16> %9) 2312 %14 = add <4 x i32> %11, %13 2313 %15 = sub <4 x i32> %1, %14 2314 store <4 x i32> %15, ptr %2, align 16 2315 ret void 2316} 2317 2318define void @umlsl_umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3, i32 %4) { 2319; CHECK-NEON-LABEL: umlsl_umlsl2_v4i32_uzp1: 2320; CHECK-NEON: // %bb.0: // %entry 2321; CHECK-NEON-NEXT: ldp q2, q3, [x1] 2322; CHECK-NEON-NEXT: uzp1 v2.8h, v2.8h, v3.8h 2323; CHECK-NEON-NEXT: umlsl v1.4s, v0.4h, v2.4h 2324; CHECK-NEON-NEXT: umlsl2 v1.4s, v0.8h, v2.8h 2325; CHECK-NEON-NEXT: str q1, [x0] 2326; CHECK-NEON-NEXT: ret 2327; 2328; CHECK-SVE-LABEL: umlsl_umlsl2_v4i32_uzp1: 2329; CHECK-SVE: // %bb.0: // %entry 2330; CHECK-SVE-NEXT: ldp q2, q3, [x1] 2331; CHECK-SVE-NEXT: uzp1 v2.8h, v2.8h, v3.8h 2332; CHECK-SVE-NEXT: umlsl v1.4s, v0.4h, v2.4h 2333; CHECK-SVE-NEXT: umlsl2 v1.4s, v0.8h, v2.8h 2334; CHECK-SVE-NEXT: str q1, [x0] 2335; CHECK-SVE-NEXT: ret 2336; 2337; CHECK-GI-LABEL: umlsl_umlsl2_v4i32_uzp1: 2338; CHECK-GI: // %bb.0: // %entry 2339; CHECK-GI-NEXT: ldp q4, q2, [x1] 2340; CHECK-GI-NEXT: mov d3, v0.d[1] 2341; CHECK-GI-NEXT: xtn v2.4h, v2.4s 2342; CHECK-GI-NEXT: xtn v4.4h, v4.4s 2343; CHECK-GI-NEXT: umull v2.4s, v3.4h, v2.4h 2344; CHECK-GI-NEXT: umlal v2.4s, v0.4h, v4.4h 2345; 
CHECK-GI-NEXT: sub v0.4s, v1.4s, v2.4s 2346; CHECK-GI-NEXT: str q0, [x0] 2347; CHECK-GI-NEXT: ret 2348entry: 2349 %5 = load <4 x i32>, ptr %3, align 4 2350 %6 = trunc <4 x i32> %5 to <4 x i16> 2351 %7 = getelementptr inbounds i32, ptr %3, i64 4 2352 %8 = load <4 x i32>, ptr %7, align 4 2353 %9 = trunc <4 x i32> %8 to <4 x i16> 2354 %10 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2355 %11 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %10, <4 x i16> %6) 2356 %12 = shufflevector <8 x i16> %0, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2357 %13 = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %12, <4 x i16> %9) 2358 %14 = add <4 x i32> %11, %13 2359 %15 = sub <4 x i32> %1, %14 2360 store <4 x i32> %15, ptr %2, align 16 2361 ret void 2362} 2363 2364define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) { 2365; CHECK-NEON-LABEL: do_stuff: 2366; CHECK-NEON: // %bb.0: 2367; CHECK-NEON-NEXT: uzp1 v0.4s, v0.4s, v0.4s 2368; CHECK-NEON-NEXT: smull2 v0.2d, v1.4s, v0.4s 2369; CHECK-NEON-NEXT: xtn v0.2s, v0.2d 2370; CHECK-NEON-NEXT: add v0.2s, v0.2s, v1.2s 2371; CHECK-NEON-NEXT: ret 2372; 2373; CHECK-SVE-LABEL: do_stuff: 2374; CHECK-SVE: // %bb.0: 2375; CHECK-SVE-NEXT: uzp1 v0.4s, v0.4s, v0.4s 2376; CHECK-SVE-NEXT: smull2 v0.2d, v1.4s, v0.4s 2377; CHECK-SVE-NEXT: xtn v0.2s, v0.2d 2378; CHECK-SVE-NEXT: add v0.2s, v0.2s, v1.2s 2379; CHECK-SVE-NEXT: ret 2380; 2381; CHECK-GI-LABEL: do_stuff: 2382; CHECK-GI: // %bb.0: 2383; CHECK-GI-NEXT: xtn v0.2s, v0.2d 2384; CHECK-GI-NEXT: mov d2, v1.d[1] 2385; CHECK-GI-NEXT: smull v0.2d, v2.2s, v0.2s 2386; CHECK-GI-NEXT: xtn v0.2s, v0.2d 2387; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s 2388; CHECK-GI-NEXT: ret 2389 %bc.1 = bitcast <2 x i64> %1 to <4 x i32> 2390 %trunc.0 = trunc <2 x i64> %0 to <2 x i32> 2391 %shuff.hi = shufflevector <4 x i32> %bc.1, <4 x i32> zeroinitializer, <2 x i32> <i32 2, i32 3> 2392 %shuff.lo = shufflevector <4 x i32> %bc.1, <4 x i32> 
zeroinitializer, <2 x i32> <i32 0, i32 1> 2393 %smull = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuff.hi, <2 x i32> %trunc.0) 2394 %trunc.smull = trunc <2 x i64> %smull to <2 x i32> 2395 %final = add <2 x i32> %trunc.smull, %shuff.lo 2396 ret <2 x i32> %final 2397} 2398 2399define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) { 2400; CHECK-NEON-LABEL: lsr: 2401; CHECK-NEON: // %bb.0: 2402; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 2403; CHECK-NEON-NEXT: shrn v1.2s, v1.2d, #32 2404; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s 2405; CHECK-NEON-NEXT: ret 2406; 2407; CHECK-SVE-LABEL: lsr: 2408; CHECK-SVE: // %bb.0: 2409; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 2410; CHECK-SVE-NEXT: shrn v1.2s, v1.2d, #32 2411; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s 2412; CHECK-SVE-NEXT: ret 2413; 2414; CHECK-GI-LABEL: lsr: 2415; CHECK-GI: // %bb.0: 2416; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #32 2417; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #32 2418; CHECK-GI-NEXT: fmov x8, d0 2419; CHECK-GI-NEXT: fmov x9, d1 2420; CHECK-GI-NEXT: mov x10, v0.d[1] 2421; CHECK-GI-NEXT: mov x11, v1.d[1] 2422; CHECK-GI-NEXT: mul x8, x8, x9 2423; CHECK-GI-NEXT: mul x9, x10, x11 2424; CHECK-GI-NEXT: mov v0.d[0], x8 2425; CHECK-GI-NEXT: mov v0.d[1], x9 2426; CHECK-GI-NEXT: ret 2427 %x = lshr <2 x i64> %a, <i64 32, i64 32> 2428 %y = lshr <2 x i64> %b, <i64 32, i64 32> 2429 %z = mul nsw <2 x i64> %x, %y 2430 ret <2 x i64> %z 2431} 2432 2433define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) { 2434; CHECK-NEON-LABEL: lsr_const: 2435; CHECK-NEON: // %bb.0: 2436; CHECK-NEON-NEXT: movi v1.2s, #31 2437; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 2438; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s 2439; CHECK-NEON-NEXT: ret 2440; 2441; CHECK-SVE-LABEL: lsr_const: 2442; CHECK-SVE: // %bb.0: 2443; CHECK-SVE-NEXT: movi v1.2s, #31 2444; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 2445; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s 2446; CHECK-SVE-NEXT: ret 2447; 2448; CHECK-GI-LABEL: lsr_const: 2449; CHECK-GI: // %bb.0: 
2450; CHECK-GI-NEXT: adrp x8, .LCPI79_0 2451; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #32 2452; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI79_0] 2453; CHECK-GI-NEXT: fmov x8, d0 2454; CHECK-GI-NEXT: fmov x9, d1 2455; CHECK-GI-NEXT: mov x10, v0.d[1] 2456; CHECK-GI-NEXT: mov x11, v1.d[1] 2457; CHECK-GI-NEXT: mul x8, x8, x9 2458; CHECK-GI-NEXT: mul x9, x10, x11 2459; CHECK-GI-NEXT: mov v0.d[0], x8 2460; CHECK-GI-NEXT: mov v0.d[1], x9 2461; CHECK-GI-NEXT: ret 2462 %x = lshr <2 x i64> %a, <i64 32, i64 32> 2463 %z = mul nsw <2 x i64> %x, <i64 31, i64 31> 2464 ret <2 x i64> %z 2465} 2466 2467define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) { 2468; CHECK-NEON-LABEL: asr: 2469; CHECK-NEON: // %bb.0: 2470; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 2471; CHECK-NEON-NEXT: shrn v1.2s, v1.2d, #32 2472; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s 2473; CHECK-NEON-NEXT: ret 2474; 2475; CHECK-SVE-LABEL: asr: 2476; CHECK-SVE: // %bb.0: 2477; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 2478; CHECK-SVE-NEXT: shrn v1.2s, v1.2d, #32 2479; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s 2480; CHECK-SVE-NEXT: ret 2481; 2482; CHECK-GI-LABEL: asr: 2483; CHECK-GI: // %bb.0: 2484; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32 2485; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #32 2486; CHECK-GI-NEXT: fmov x8, d0 2487; CHECK-GI-NEXT: fmov x9, d1 2488; CHECK-GI-NEXT: mov x10, v0.d[1] 2489; CHECK-GI-NEXT: mov x11, v1.d[1] 2490; CHECK-GI-NEXT: mul x8, x8, x9 2491; CHECK-GI-NEXT: mul x9, x10, x11 2492; CHECK-GI-NEXT: mov v0.d[0], x8 2493; CHECK-GI-NEXT: mov v0.d[1], x9 2494; CHECK-GI-NEXT: ret 2495 %x = ashr <2 x i64> %a, <i64 32, i64 32> 2496 %y = ashr <2 x i64> %b, <i64 32, i64 32> 2497 %z = mul nsw <2 x i64> %x, %y 2498 ret <2 x i64> %z 2499} 2500 2501define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) { 2502; CHECK-NEON-LABEL: asr_const: 2503; CHECK-NEON: // %bb.0: 2504; CHECK-NEON-NEXT: movi v1.2s, #31 2505; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32 2506; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s 2507; CHECK-NEON-NEXT: ret 2508; 
2509; CHECK-SVE-LABEL: asr_const: 2510; CHECK-SVE: // %bb.0: 2511; CHECK-SVE-NEXT: movi v1.2s, #31 2512; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32 2513; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s 2514; CHECK-SVE-NEXT: ret 2515; 2516; CHECK-GI-LABEL: asr_const: 2517; CHECK-GI: // %bb.0: 2518; CHECK-GI-NEXT: adrp x8, .LCPI81_0 2519; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32 2520; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI81_0] 2521; CHECK-GI-NEXT: fmov x8, d0 2522; CHECK-GI-NEXT: fmov x9, d1 2523; CHECK-GI-NEXT: mov x10, v0.d[1] 2524; CHECK-GI-NEXT: mov x11, v1.d[1] 2525; CHECK-GI-NEXT: mul x8, x8, x9 2526; CHECK-GI-NEXT: mul x9, x10, x11 2527; CHECK-GI-NEXT: mov v0.d[0], x8 2528; CHECK-GI-NEXT: mov v0.d[1], x9 2529; CHECK-GI-NEXT: ret 2530 %x = ashr <2 x i64> %a, <i64 32, i64 32> 2531 %z = mul nsw <2 x i64> %x, <i64 31, i64 31> 2532 ret <2 x i64> %z 2533} 2534 2535define <8 x i16> @smulladdl_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind { 2536; CHECK-NEON-LABEL: smulladdl_v8i8_v8i16: 2537; CHECK-NEON: // %bb.0: 2538; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b 2539; CHECK-NEON-NEXT: saddw v0.8h, v0.8h, v2.8b 2540; CHECK-NEON-NEXT: ret 2541; 2542; CHECK-SVE-LABEL: smulladdl_v8i8_v8i16: 2543; CHECK-SVE: // %bb.0: 2544; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b 2545; CHECK-SVE-NEXT: saddw v0.8h, v0.8h, v2.8b 2546; CHECK-SVE-NEXT: ret 2547; 2548; CHECK-GI-LABEL: smulladdl_v8i8_v8i16: 2549; CHECK-GI: // %bb.0: 2550; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 2551; CHECK-GI-NEXT: smlal v2.8h, v0.8b, v1.8b 2552; CHECK-GI-NEXT: mov v0.16b, v2.16b 2553; CHECK-GI-NEXT: ret 2554 %tmp1 = sext <8 x i8> %A to <8 x i16> 2555 %tmp2 = sext <8 x i8> %B to <8 x i16> 2556 %tmp3 = sext <8 x i8> %C to <8 x i16> 2557 %tmp4 = mul <8 x i16> %tmp1, %tmp2 2558 %tmp5 = add <8 x i16> %tmp4, %tmp3 2559 ret <8 x i16> %tmp5 2560} 2561 2562define <8 x i16> @umulladdl_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) nounwind { 2563; CHECK-NEON-LABEL: umulladdl_v8i8_v8i16: 2564; CHECK-NEON: // %bb.0: 
2565; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b 2566; CHECK-NEON-NEXT: uaddw v0.8h, v0.8h, v2.8b 2567; CHECK-NEON-NEXT: ret 2568; 2569; CHECK-SVE-LABEL: umulladdl_v8i8_v8i16: 2570; CHECK-SVE: // %bb.0: 2571; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b 2572; CHECK-SVE-NEXT: uaddw v0.8h, v0.8h, v2.8b 2573; CHECK-SVE-NEXT: ret 2574; 2575; CHECK-GI-LABEL: umulladdl_v8i8_v8i16: 2576; CHECK-GI: // %bb.0: 2577; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 2578; CHECK-GI-NEXT: umlal v2.8h, v0.8b, v1.8b 2579; CHECK-GI-NEXT: mov v0.16b, v2.16b 2580; CHECK-GI-NEXT: ret 2581 %tmp1 = zext <8 x i8> %A to <8 x i16> 2582 %tmp2 = zext <8 x i8> %B to <8 x i16> 2583 %tmp3 = zext <8 x i8> %C to <8 x i16> 2584 %tmp4 = mul <8 x i16> %tmp1, %tmp2 2585 %tmp5 = add <8 x i16> %tmp4, %tmp3 2586 ret <8 x i16> %tmp5 2587} 2588 2589define <8 x i16> @smlall_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i16> %C) nounwind { 2590; CHECK-LABEL: smlall_v8i8_v8i16: 2591; CHECK: // %bb.0: 2592; CHECK-NEXT: smlal v2.8h, v0.8b, v1.8b 2593; CHECK-NEXT: mov v0.16b, v2.16b 2594; CHECK-NEXT: ret 2595 %tmp1 = sext <8 x i8> %A to <8 x i16> 2596 %tmp2 = sext <8 x i8> %B to <8 x i16> 2597 %tmp4 = mul <8 x i16> %tmp1, %tmp2 2598 %tmp5 = add <8 x i16> %tmp4, %C 2599 ret <8 x i16> %tmp5 2600} 2601 2602define <8 x i16> @umlall_v8i8_v8i16(<8 x i8> %A, <8 x i8> %B, <8 x i16> %C) nounwind { 2603; CHECK-LABEL: umlall_v8i8_v8i16: 2604; CHECK: // %bb.0: 2605; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b 2606; CHECK-NEXT: mov v0.16b, v2.16b 2607; CHECK-NEXT: ret 2608 %tmp1 = zext <8 x i8> %A to <8 x i16> 2609 %tmp2 = zext <8 x i8> %B to <8 x i16> 2610 %tmp4 = mul <8 x i16> %tmp1, %tmp2 2611 %tmp5 = add <8 x i16> %tmp4, %C 2612 ret <8 x i16> %tmp5 2613} 2614 2615define <8 x i16> @smulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind { 2616; CHECK-NEON-LABEL: smulladdl_const_v8i8_v8i16: 2617; CHECK-NEON: // %bb.0: 2618; CHECK-NEON-NEXT: movi v2.8b, #10 2619; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v2.8b 2620; CHECK-NEON-NEXT: saddw v0.8h, 
v0.8h, v1.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smulladdl_const_v8i8_v8i16:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: movi v2.8b, #10
; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: saddw v0.8h, v0.8h, v1.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: smulladdl_const_v8i8_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v2.8h, #10
; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: saddw v0.8h, v0.8h, v1.8b
; CHECK-GI-NEXT: ret
  %tmp1 = sext <8 x i8> %A to <8 x i16>
  %tmp3 = sext <8 x i8> %C to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %tmp5 = add <8 x i16> %tmp4, %tmp3
  ret <8 x i16> %tmp5
}

; zext(A) * splat(10) + zext(C), widening i8 -> i16.
; The NEON and SVE runs expect the constant as a narrow umull operand followed
; by uaddw; the GlobalISel run expects ushll + full-width mul + uaddw.
define <8 x i16> @umulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind {
; CHECK-NEON-LABEL: umulladdl_const_v8i8_v8i16:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: movi v2.8b, #10
; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v2.8b
; CHECK-NEON-NEXT: uaddw v0.8h, v0.8h, v1.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umulladdl_const_v8i8_v8i16:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: movi v2.8b, #10
; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: uaddw v0.8h, v0.8h, v1.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umulladdl_const_v8i8_v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v2.8h, #10
; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b
; CHECK-GI-NEXT: ret
  %tmp1 = zext <8 x i8> %A to <8 x i16>
  %tmp3 = zext <8 x i8> %C to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp1, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %tmp5 = add <8 x i16> %tmp4, %tmp3
  ret <8 x i16> %tmp5
}

; (sext(src1) + sext(src2)) * sext(mul), widening i8 -> i16; the add carries
; nuw nsw. The NEON and SVE runs expect the multiply distributed over the add
; (smull + smlal); the GlobalISel run expects saddl, sshll and a full-width mul.
define <8 x i16> @sdistribute_v8i8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8> %mul) {
; CHECK-NEON-LABEL: sdistribute_v8i8:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v2.8b
; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: sdistribute_v8i8:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: sdistribute_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ret
entry:
  %4 = sext <8 x i8> %src1 to <8 x i16>
  %5 = sext <8 x i8> %mul to <8 x i16>
  %7 = sext <8 x i8> %src2 to <8 x i16>
  %8 = add nuw nsw <8 x i16> %4, %7
  %9 = mul <8 x i16> %8, %5
  ret <8 x i16> %9
}

; (sext(src1) + splat(10)) * sext(mul): the constant is one of the add operands.
; The NEON and SVE runs expect movi of the narrow constant, then smull + smlal;
; the GlobalISel run expects saddw onto a wide constant, sshll, and a mul.
define <8 x i16> @sdistribute_const1_v8i8(<8 x i8> %src1, <8 x i8> %mul) {
; CHECK-NEON-LABEL: sdistribute_const1_v8i8:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.8b, #10
; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-NEON-NEXT: smlal v0.8h, v2.8b, v1.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: sdistribute_const1_v8i8:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.8b, #10
; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-SVE-NEXT: smlal v0.8h, v2.8b, v1.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: sdistribute_const1_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.8h, #10
; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: saddw v0.8h, v2.8h, v0.8b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
entry:
  %4 = sext <8 x i8> %src1 to <8 x i16>
  %5 = sext <8 x i8> %mul to <8 x i16>
  %8 = add nuw nsw <8 x i16> %4, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %9 = mul <8 x i16> %8, %5
  ret <8 x i16> %9
}

; (sext(src1) + sext(src2)) * splat(10): the constant is the multiplier.
; The NEON and SVE runs expect movi of the narrow constant, then smull + smlal;
; the GlobalISel run expects saddl followed by a mul with a wide constant.
define <8 x i16> @sdistribute_const2_v8i8(<8 x i8> %src1, <8 x i8> %src2) {
; CHECK-NEON-LABEL: sdistribute_const2_v8i8:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.8b, #10
; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v2.8b
; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: sdistribute_const2_v8i8:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.8b, #10
; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: sdistribute_const2_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.8h, #10
; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ret
entry:
  %4 = sext <8 x i8> %src1 to <8 x i16>
  %5 = sext <8 x i8> %src2 to <8 x i16>
  %8 = add nuw nsw <8 x i16> %4, %5
  %9 = mul <8 x i16> %8, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  ret <8 x i16> %9
}

; Unsigned counterpart of sdistribute_v8i8:
; (zext(src1) + zext(src2)) * zext(mul). The NEON and SVE runs expect
; umull + umlal; the GlobalISel run expects ushll, uaddl and a full-width mul.
define <8 x i16> @udistribute_v8i8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8> %mul) {
; CHECK-NEON-LABEL: udistribute_v8i8:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v2.8b
; CHECK-NEON-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: udistribute_v8i8:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: udistribute_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ret
entry:
  %4 = zext <8 x i8> %src1 to <8 x i16>
  %5 = zext <8 x i8> %mul to <8 x i16>
  %7 = zext <8 x i8> %src2 to <8 x i16>
  %8 = add nuw nsw <8 x i16> %4, %7
  %9 = mul <8 x i16> %8, %5
  ret <8 x i16> %9
}

; Unsigned counterpart of sdistribute_const1_v8i8:
; (zext(src1) + splat(10)) * zext(mul). The NEON and SVE runs expect
; movi + umull + umlal; the GlobalISel run expects uaddw, ushll and a mul.
define <8 x i16> @udistribute_const1_v8i8(<8 x i8> %src1, <8 x i8> %mul) {
; CHECK-NEON-LABEL: udistribute_const1_v8i8:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.8b, #10
; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEON-NEXT: umlal v0.8h, v2.8b, v1.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: udistribute_const1_v8i8:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.8b, #10
; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-SVE-NEXT: umlal v0.8h, v2.8b, v1.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: udistribute_const1_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.8h, #10
; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT: uaddw v0.8h, v2.8h, v0.8b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
entry:
  %4 = zext <8 x i8> %src1 to <8 x i16>
  %5 = zext <8 x i8> %mul to <8 x i16>
  %8 = add nuw nsw <8 x i16> %4, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %9 = mul <8 x i16> %8, %5
  ret <8 x i16> %9
}

; Unsigned counterpart of sdistribute_const2_v8i8:
; (zext(src1) + zext(src2)) * splat(10). The NEON and SVE runs expect
; movi + umull + umlal; the GlobalISel run expects uaddl and a mul.
define <8 x i16> @udistribute_const2_v8i8(<8 x i8> %src1, <8 x i8> %src2) {
; CHECK-NEON-LABEL: udistribute_const2_v8i8:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.8b, #10
; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v2.8b
; CHECK-NEON-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: udistribute_const2_v8i8:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.8b, #10
; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v2.8b
; CHECK-SVE-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: udistribute_const2_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.8h, #10
; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ret
entry:
  %4 = zext <8 x i8> %src1 to <8 x i16>
  %5 = zext <8 x i8> %src2 to <8 x i16>
  %8 = add nuw nsw <8 x i16> %4, %5
  %9 = mul <8 x i16> %8, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  ret <8 x i16> %9
}


; sext(A) * sext(B) + sext(C), widening i32 -> i64; all three inputs are narrow.
; The NEON and SVE runs expect smull + saddw; the GlobalISel run expects C to be
; widened with sshll and then used as the smlal accumulator.
define <2 x i64> @smulladdl_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-NEON-LABEL: smulladdl_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: saddw v0.2d, v0.2d, v2.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smulladdl_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: saddw v0.2d, v0.2d, v2.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: smulladdl_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sshll v2.2d, v2.2s, #0
; CHECK-GI-NEXT: smlal v2.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
  %tmp1 = sext <2 x i32> %A to <2 x i64>
  %tmp2 = sext <2 x i32> %B to <2 x i64>
  %tmp3 = sext <2 x i32> %C to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp1, %tmp2
  %tmp5 = add <2 x i64> %tmp4, %tmp3
  ret <2 x i64> %tmp5
}

; Unsigned counterpart of smulladdl_v2i32_v2i64: zext(A) * zext(B) + zext(C).
; The NEON and SVE runs expect umull + uaddw; the GlobalISel run expects
; ushll of C followed by umlal.
define <2 x i64> @umulladdl_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-NEON-LABEL: umulladdl_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umulladdl_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: uaddw v0.2d, v0.2d, v2.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umulladdl_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-GI-NEXT: umlal v2.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: ret
  %tmp1 = zext <2 x i32> %A to <2 x i64>
  %tmp2 = zext <2 x i32> %B to <2 x i64>
  %tmp3 = zext <2 x i32> %C to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp1, %tmp2
  %tmp5 = add <2 x i64> %tmp4, %tmp3
  ret <2 x i64> %tmp5
}

; sext(A) * sext(B) + C, where the addend C is already <2 x i64>.
; All three runs share one expectation: a single smlal into C, then a register
; move of the result into v0.
define <2 x i64> @smlall_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlall_v2i32_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: smlal v2.2d, v0.2s, v1.2s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
  %tmp1 = sext <2 x i32> %A to <2 x i64>
  %tmp2 = sext <2 x i32> %B to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp1, %tmp2
  %tmp5 = add <2 x i64> %tmp4, %C
  ret <2 x i64> %tmp5
}

; Unsigned counterpart of smlall_v2i32_v2i64: zext(A) * zext(B) + C with a
; wide addend. All three runs expect a single umlal into C.
define <2 x i64> @umlall_v2i32_v2i64(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlall_v2i32_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: umlal v2.2d, v0.2s, v1.2s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
  %tmp1 = zext <2 x i32> %A to <2 x i64>
  %tmp2 = zext <2 x i32> %B to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp1, %tmp2
  %tmp5 = add <2 x i64> %tmp4, %C
  ret <2 x i64> %tmp5
}

; sext(A) * splat(10) + sext(C), widening i32 -> i64.
; The NEON and SVE runs expect movi of the narrow constant, smull and saddw.
; The GlobalISel run expects the constant vector loaded from .LCPI98_0 and the
; 64-bit element multiply performed lane by lane on x registers before saddw.
define <2 x i64> @smulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwind {
; CHECK-NEON-LABEL: smulladdl_const_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: movi v2.2s, #10
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT: saddw v0.2d, v0.2d, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smulladdl_const_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: movi v2.2s, #10
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT: saddw v0.2d, v0.2d, v1.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: smulladdl_const_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI98_0
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI98_0]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mov x11, v2.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: saddw v0.2d, v0.2d, v1.2s
; CHECK-GI-NEXT: ret
  %tmp1 = sext <2 x i32> %A to <2 x i64>
  %tmp3 = sext <2 x i32> %C to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp1, <i64 10, i64 10>
  %tmp5 = add <2 x i64> %tmp4, %tmp3
  ret <2 x i64> %tmp5
}

; Unsigned counterpart of smulladdl_const_v2i32_v2i64:
; zext(A) * splat(10) + zext(C). The NEON and SVE runs expect
; movi + umull + uaddw; the GlobalISel run expects a constant load from
; .LCPI99_0, a lane-by-lane multiply on x registers, then uaddw.
define <2 x i64> @umulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwind {
; CHECK-NEON-LABEL: umulladdl_const_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: movi v2.2s, #10
; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT: uaddw v0.2d, v0.2d, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: umulladdl_const_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: movi v2.2s, #10
; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT: uaddw v0.2d, v0.2d, v1.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: umulladdl_const_v2i32_v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: adrp x8, .LCPI99_0
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI99_0]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mov x11, v2.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: uaddw v0.2d, v0.2d, v1.2s
; CHECK-GI-NEXT: ret
  %tmp1 = zext <2 x i32> %A to <2 x i64>
  %tmp3 = zext <2 x i32> %C to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp1, <i64 10, i64 10>
  %tmp5 = add <2 x i64> %tmp4, %tmp3
  ret <2 x i64> %tmp5
}

; (sext(src1) + sext(src2)) * sext(mul), widening i32 -> i64; the add carries
; nuw nsw. The NEON and SVE runs expect smull + smlal; the GlobalISel run
; expects sshll and saddl, then a lane-by-lane multiply on x registers.
define <2 x i64> @sdistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32> %mul) {
; CHECK-NEON-LABEL: sdistribute_v2i32:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: sdistribute_v2i32:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: sdistribute_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sshll v2.2d, v2.2s, #0
; CHECK-GI-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mov x11, v2.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
  %4 = sext <2 x i32> %src1 to <2 x i64>
  %5 = sext <2 x i32> %mul to <2 x i64>
  %7 = sext <2 x i32> %src2 to <2 x i64>
  %8 = add nuw nsw <2 x i64> %4, %7
  %9 = mul <2 x i64> %8, %5
  ret <2 x i64> %9
}

; (sext(src1) + splat(10)) * sext(mul): the constant is one of the add operands.
; The NEON and SVE runs expect movi + smull + smlal; the GlobalISel run expects
; the constant loaded from .LCPI101_0, saddw, and a lane-by-lane multiply on
; x registers.
define <2 x i64> @sdistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) {
; CHECK-NEON-LABEL: sdistribute_const1_v2i32:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.2s, #10
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: smlal v0.2d, v2.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: sdistribute_const1_v2i32:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.2s, #10
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: smlal v0.2d, v2.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: sdistribute_const1_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI101_0
; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI101_0]
; CHECK-GI-NEXT: saddw v0.2d, v2.2d, v0.2s
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
  %4 = sext <2 x i32> %src1 to <2 x i64>
  %5 = sext <2 x i32> %mul to <2 x i64>
  %8 = add nuw nsw <2 x i64> %4, <i64 10, i64 10>
  %9 = mul <2 x i64> %8, %5
  ret <2 x i64> %9
}

; (sext(src1) + sext(src2)) * splat(10): the constant is the multiplier.
; The NEON and SVE runs expect movi + smull + smlal; the GlobalISel run expects
; saddl, the constant loaded from .LCPI102_0, and a lane-by-lane multiply on
; x registers.
define <2 x i64> @sdistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) {
; CHECK-NEON-LABEL: sdistribute_const2_v2i32:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.2s, #10
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: sdistribute_const2_v2i32:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.2s, #10
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: sdistribute_const2_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI102_0
; CHECK-GI-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI102_0]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
  %4 = sext <2 x i32> %src1 to <2 x i64>
  %5 = sext <2 x i32> %src2 to <2 x i64>
  %8 = add nuw nsw <2 x i64> %4, %5
  %9 = mul <2 x i64> %8, <i64 10, i64 10>
  ret <2 x i64> %9
}

; Unsigned counterpart of sdistribute_v2i32:
; (zext(src1) + zext(src2)) * zext(mul). The NEON and SVE runs expect
; umull + umlal; the GlobalISel run expects ushll and uaddl, then a
; lane-by-lane multiply on x registers.
define <2 x i64> @udistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32> %mul) {
; CHECK-NEON-LABEL: udistribute_v2i32:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: udistribute_v2i32:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: udistribute_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-GI-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mov x11, v2.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
  %4 = zext <2 x i32> %src1 to <2 x i64>
  %5 = zext <2 x i32> %mul to <2 x i64>
  %7 = zext <2 x i32> %src2 to <2 x i64>
  %8 = add nuw nsw <2 x i64> %4, %7
  %9 = mul <2 x i64> %8, %5
  ret <2 x i64> %9
}

; Unsigned counterpart of sdistribute_const1_v2i32:
; (zext(src1) + splat(10)) * zext(mul). The NEON and SVE runs expect
; movi + umull + umlal; the GlobalISel run expects the constant loaded from
; .LCPI104_0, uaddw, and a lane-by-lane multiply on x registers.
define <2 x i64> @udistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) {
; CHECK-NEON-LABEL: udistribute_const1_v2i32:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.2s, #10
; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: umlal v0.2d, v2.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: udistribute_const1_v2i32:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.2s, #10
; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: umlal v0.2d, v2.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: udistribute_const1_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI104_0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI104_0]
; CHECK-GI-NEXT: uaddw v0.2d, v2.2d, v0.2s
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
  %4 = zext <2 x i32> %src1 to <2 x i64>
  %5 = zext <2 x i32> %mul to <2 x i64>
  %8 = add nuw nsw <2 x i64> %4, <i64 10, i64 10>
  %9 = mul <2 x i64> %8, %5
  ret <2 x i64> %9
}

; Unsigned counterpart of sdistribute_const2_v2i32:
; (zext(src1) + zext(src2)) * splat(10). The NEON and SVE runs expect
; movi + umull + umlal; the GlobalISel run expects uaddl, the constant loaded
; from .LCPI105_0, and a lane-by-lane multiply on x registers.
define <2 x i64> @udistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) {
; CHECK-NEON-LABEL: udistribute_const2_v2i32:
; CHECK-NEON: // %bb.0: // %entry
; CHECK-NEON-NEXT: movi v2.2s, #10
; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
; CHECK-NEON-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: udistribute_const2_v2i32:
; CHECK-SVE: // %bb.0: // %entry
; CHECK-SVE-NEXT: movi v2.2s, #10
; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
; CHECK-SVE-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-SVE-NEXT: ret
;
; CHECK-GI-LABEL: udistribute_const2_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: adrp x8, .LCPI105_0
; CHECK-GI-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI105_0]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mov x10, v0.d[1]
; CHECK-GI-NEXT: mov x11, v1.d[1]
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: mul x9, x10, x11
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov v0.d[1], x9
; CHECK-GI-NEXT: ret
entry:
  %4 = zext <2 x i32> %src1 to <2 x i64>
  %5 = zext <2 x i32> %src2 to <2 x i64>
  %8 = add nuw nsw <2 x i64> %4, %5
  %9 = mul <2 x i64> %8, <i64 10, i64 10>
  ret <2 x i64> %9
}

; Declarations of AArch64 NEON multiply intrinsics. Each takes two vectors of
; narrow elements and returns a vector whose elements are twice as wide.
; NOTE(review): none of the functions visible in this part of the file call
; them; presumably they are used by tests earlier in the file - confirm.
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)