1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 2; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD 3; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI 4 5; CHECK-GI: warning: Instruction selection used fallback path for saddlp1d 6; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uaddlp1d 7 8define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind { 9; CHECK-LABEL: addhn8b: 10; CHECK: // %bb.0: 11; CHECK-NEXT: ldr q0, [x0] 12; CHECK-NEXT: ldr q1, [x1] 13; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h 14; CHECK-NEXT: ret 15 %tmp1 = load <8 x i16>, ptr %A 16 %tmp2 = load <8 x i16>, ptr %B 17 %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) 18 ret <8 x i8> %tmp3 19} 20 21define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind { 22; CHECK-LABEL: addhn4h: 23; CHECK: // %bb.0: 24; CHECK-NEXT: ldr q0, [x0] 25; CHECK-NEXT: ldr q1, [x1] 26; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s 27; CHECK-NEXT: ret 28 %tmp1 = load <4 x i32>, ptr %A 29 %tmp2 = load <4 x i32>, ptr %B 30 %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) 31 ret <4 x i16> %tmp3 32} 33 34define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind { 35; CHECK-LABEL: addhn2s: 36; CHECK: // %bb.0: 37; CHECK-NEXT: ldr q0, [x0] 38; CHECK-NEXT: ldr q1, [x1] 39; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d 40; CHECK-NEXT: ret 41 %tmp1 = load <2 x i64>, ptr %A 42 %tmp2 = load <2 x i64>, ptr %B 43 %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) 44 ret <2 x i32> %tmp3 45} 46 47define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { 48; CHECK-LABEL: addhn2_16b: 49; CHECK: // %bb.0: 50; CHECK-NEXT: addhn v2.8b, v0.8h, v1.8h 51; CHECK-NEXT: addhn2 v2.16b, v0.8h, v1.8h 52; CHECK-NEXT: mov v0.16b, v2.16b 53; CHECK-NEXT: ret 54 %vaddhn2.i = tail call <8 x i8> 
@llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind 55 %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind 56 %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 57 ret <16 x i8> %res 58} 59 60define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { 61; CHECK-LABEL: addhn2_8h: 62; CHECK: // %bb.0: 63; CHECK-NEXT: addhn v2.4h, v0.4s, v1.4s 64; CHECK-NEXT: addhn2 v2.8h, v0.4s, v1.4s 65; CHECK-NEXT: mov v0.16b, v2.16b 66; CHECK-NEXT: ret 67 %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind 68 %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind 69 %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 70 ret <8 x i16> %res 71} 72 73define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { 74; CHECK-LABEL: addhn2_4s: 75; CHECK: // %bb.0: 76; CHECK-NEXT: addhn v2.2s, v0.2d, v1.2d 77; CHECK-NEXT: addhn2 v2.4s, v0.2d, v1.2d 78; CHECK-NEXT: mov v0.16b, v2.16b 79; CHECK-NEXT: ret 80 %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind 81 %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind 82 %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 83 ret <4 x i32> %res 84} 85 86declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone 87declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone 88declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone 89 90 91define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind { 92; 
CHECK-LABEL: raddhn8b: 93; CHECK: // %bb.0: 94; CHECK-NEXT: ldr q0, [x0] 95; CHECK-NEXT: ldr q1, [x1] 96; CHECK-NEXT: raddhn v0.8b, v0.8h, v1.8h 97; CHECK-NEXT: ret 98 %tmp1 = load <8 x i16>, ptr %A 99 %tmp2 = load <8 x i16>, ptr %B 100 %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) 101 ret <8 x i8> %tmp3 102} 103 104define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind { 105; CHECK-LABEL: raddhn4h: 106; CHECK: // %bb.0: 107; CHECK-NEXT: ldr q0, [x0] 108; CHECK-NEXT: ldr q1, [x1] 109; CHECK-NEXT: raddhn v0.4h, v0.4s, v1.4s 110; CHECK-NEXT: ret 111 %tmp1 = load <4 x i32>, ptr %A 112 %tmp2 = load <4 x i32>, ptr %B 113 %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) 114 ret <4 x i16> %tmp3 115} 116 117define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind { 118; CHECK-LABEL: raddhn2s: 119; CHECK: // %bb.0: 120; CHECK-NEXT: ldr q0, [x0] 121; CHECK-NEXT: ldr q1, [x1] 122; CHECK-NEXT: raddhn v0.2s, v0.2d, v1.2d 123; CHECK-NEXT: ret 124 %tmp1 = load <2 x i64>, ptr %A 125 %tmp2 = load <2 x i64>, ptr %B 126 %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) 127 ret <2 x i32> %tmp3 128} 129 130define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { 131; CHECK-LABEL: raddhn2_16b: 132; CHECK: // %bb.0: 133; CHECK-NEXT: raddhn v2.8b, v0.8h, v1.8h 134; CHECK-NEXT: raddhn2 v2.16b, v0.8h, v1.8h 135; CHECK-NEXT: mov v0.16b, v2.16b 136; CHECK-NEXT: ret 137 %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind 138 %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind 139 %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 140 ret <16 x i8> %res 141} 142 143define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) 
nounwind { 144; CHECK-LABEL: raddhn2_8h: 145; CHECK: // %bb.0: 146; CHECK-NEXT: raddhn v2.4h, v0.4s, v1.4s 147; CHECK-NEXT: raddhn2 v2.8h, v0.4s, v1.4s 148; CHECK-NEXT: mov v0.16b, v2.16b 149; CHECK-NEXT: ret 150 %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind 151 %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind 152 %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 153 ret <8 x i16> %res 154} 155 156define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { 157; CHECK-LABEL: raddhn2_4s: 158; CHECK: // %bb.0: 159; CHECK-NEXT: raddhn v2.2s, v0.2d, v1.2d 160; CHECK-NEXT: raddhn2 v2.4s, v0.2d, v1.2d 161; CHECK-NEXT: mov v0.16b, v2.16b 162; CHECK-NEXT: ret 163 %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind 164 %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind 165 %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 166 ret <4 x i32> %res 167} 168 169declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone 170declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone 171declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone 172 173define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind { 174; CHECK-LABEL: saddl8h: 175; CHECK: // %bb.0: 176; CHECK-NEXT: ldr d0, [x0] 177; CHECK-NEXT: ldr d1, [x1] 178; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b 179; CHECK-NEXT: ret 180 %tmp1 = load <8 x i8>, ptr %A 181 %tmp2 = load <8 x i8>, ptr %B 182 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16> 183 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16> 184 %tmp5 = add <8 x i16> %tmp3, %tmp4 185 ret <8 x i16> %tmp5 186} 187 188define <4 x i32> 
@saddl4s(ptr %A, ptr %B) nounwind { 189; CHECK-LABEL: saddl4s: 190; CHECK: // %bb.0: 191; CHECK-NEXT: ldr d0, [x0] 192; CHECK-NEXT: ldr d1, [x1] 193; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h 194; CHECK-NEXT: ret 195 %tmp1 = load <4 x i16>, ptr %A 196 %tmp2 = load <4 x i16>, ptr %B 197 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32> 198 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32> 199 %tmp5 = add <4 x i32> %tmp3, %tmp4 200 ret <4 x i32> %tmp5 201} 202 203define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind { 204; CHECK-LABEL: saddl2d: 205; CHECK: // %bb.0: 206; CHECK-NEXT: ldr d0, [x0] 207; CHECK-NEXT: ldr d1, [x1] 208; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s 209; CHECK-NEXT: ret 210 %tmp1 = load <2 x i32>, ptr %A 211 %tmp2 = load <2 x i32>, ptr %B 212 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64> 213 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64> 214 %tmp5 = add <2 x i64> %tmp3, %tmp4 215 ret <2 x i64> %tmp5 216} 217 218define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { 219; CHECK-LABEL: saddl2_8h: 220; CHECK: // %bb.0: 221; CHECK-NEXT: saddl2 v0.8h, v0.16b, v1.16b 222; CHECK-NEXT: ret 223 %tmp = bitcast <16 x i8> %a to <2 x i64> 224 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 225 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> 226 %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16> 227 %tmp2 = bitcast <16 x i8> %b to <2 x i64> 228 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 229 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8> 230 %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16> 231 %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i 232 ret <8 x i16> %add.i 233} 234 235define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { 236; CHECK-LABEL: saddl2_4s: 237; CHECK: // %bb.0: 238; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h 239; CHECK-NEXT: ret 240 %tmp = bitcast <8 x i16> %a to <2 x i64> 241 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 242 %tmp1 = 
bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> 243 %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32> 244 %tmp2 = bitcast <8 x i16> %b to <2 x i64> 245 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 246 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16> 247 %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32> 248 %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i 249 ret <4 x i32> %add.i 250} 251 252define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { 253; CHECK-LABEL: saddl2_2d: 254; CHECK: // %bb.0: 255; CHECK-NEXT: saddl2 v0.2d, v0.4s, v1.4s 256; CHECK-NEXT: ret 257 %tmp = bitcast <4 x i32> %a to <2 x i64> 258 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 259 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> 260 %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64> 261 %tmp2 = bitcast <4 x i32> %b to <2 x i64> 262 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 263 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32> 264 %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64> 265 %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i 266 ret <2 x i64> %add.i 267} 268 269define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind { 270; CHECK-LABEL: uaddl8h: 271; CHECK: // %bb.0: 272; CHECK-NEXT: ldr d0, [x0] 273; CHECK-NEXT: ldr d1, [x1] 274; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b 275; CHECK-NEXT: ret 276 %tmp1 = load <8 x i8>, ptr %A 277 %tmp2 = load <8 x i8>, ptr %B 278 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16> 279 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16> 280 %tmp5 = add <8 x i16> %tmp3, %tmp4 281 ret <8 x i16> %tmp5 282} 283 284define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind { 285; CHECK-LABEL: uaddl4s: 286; CHECK: // %bb.0: 287; CHECK-NEXT: ldr d0, [x0] 288; CHECK-NEXT: ldr d1, [x1] 289; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h 290; CHECK-NEXT: ret 291 %tmp1 = load <4 x i16>, ptr %A 292 %tmp2 = load <4 x i16>, ptr %B 293 %tmp3 = zext <4 x i16> %tmp1 to <4 x 
i32> 294 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32> 295 %tmp5 = add <4 x i32> %tmp3, %tmp4 296 ret <4 x i32> %tmp5 297} 298 299define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind { 300; CHECK-LABEL: uaddl2d: 301; CHECK: // %bb.0: 302; CHECK-NEXT: ldr d0, [x0] 303; CHECK-NEXT: ldr d1, [x1] 304; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s 305; CHECK-NEXT: ret 306 %tmp1 = load <2 x i32>, ptr %A 307 %tmp2 = load <2 x i32>, ptr %B 308 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64> 309 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64> 310 %tmp5 = add <2 x i64> %tmp3, %tmp4 311 ret <2 x i64> %tmp5 312} 313 314 315define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { 316; CHECK-LABEL: uaddl2_8h: 317; CHECK: // %bb.0: 318; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b 319; CHECK-NEXT: ret 320 %tmp = bitcast <16 x i8> %a to <2 x i64> 321 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 322 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> 323 %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16> 324 %tmp2 = bitcast <16 x i8> %b to <2 x i64> 325 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 326 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8> 327 %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16> 328 %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i 329 ret <8 x i16> %add.i 330} 331 332define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { 333; CHECK-LABEL: uaddl2_4s: 334; CHECK: // %bb.0: 335; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h 336; CHECK-NEXT: ret 337 %tmp = bitcast <8 x i16> %a to <2 x i64> 338 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 339 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> 340 %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32> 341 %tmp2 = bitcast <8 x i16> %b to <2 x i64> 342 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 343 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16> 344 %vmovl.i.i5.i = zext 
<4 x i16> %tmp3 to <4 x i32> 345 %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i 346 ret <4 x i32> %add.i 347} 348 349define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { 350; CHECK-LABEL: uaddl2_2d: 351; CHECK: // %bb.0: 352; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s 353; CHECK-NEXT: ret 354 %tmp = bitcast <4 x i32> %a to <2 x i64> 355 %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1> 356 %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> 357 %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64> 358 %tmp2 = bitcast <4 x i32> %b to <2 x i64> 359 %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 360 %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32> 361 %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64> 362 %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i 363 ret <2 x i64> %add.i 364} 365 366define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind { 367; CHECK-LABEL: uaddw8h: 368; CHECK: // %bb.0: 369; CHECK-NEXT: ldr q0, [x0] 370; CHECK-NEXT: ldr d1, [x1] 371; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b 372; CHECK-NEXT: ret 373 %tmp1 = load <8 x i16>, ptr %A 374 %tmp2 = load <8 x i8>, ptr %B 375 %tmp3 = zext <8 x i8> %tmp2 to <8 x i16> 376 %tmp4 = add <8 x i16> %tmp1, %tmp3 377 ret <8 x i16> %tmp4 378} 379 380define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind { 381; CHECK-LABEL: uaddw4s: 382; CHECK: // %bb.0: 383; CHECK-NEXT: ldr q0, [x0] 384; CHECK-NEXT: ldr d1, [x1] 385; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h 386; CHECK-NEXT: ret 387 %tmp1 = load <4 x i32>, ptr %A 388 %tmp2 = load <4 x i16>, ptr %B 389 %tmp3 = zext <4 x i16> %tmp2 to <4 x i32> 390 %tmp4 = add <4 x i32> %tmp1, %tmp3 391 ret <4 x i32> %tmp4 392} 393 394define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind { 395; CHECK-LABEL: uaddw2d: 396; CHECK: // %bb.0: 397; CHECK-NEXT: ldr q0, [x0] 398; CHECK-NEXT: ldr d1, [x1] 399; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s 400; CHECK-NEXT: ret 401 %tmp1 = load <2 x i64>, ptr %A 402 %tmp2 = load <2 x 
i32>, ptr %B 403 %tmp3 = zext <2 x i32> %tmp2 to <2 x i64> 404 %tmp4 = add <2 x i64> %tmp1, %tmp3 405 ret <2 x i64> %tmp4 406} 407 408define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind { 409; CHECK-SD-LABEL: uaddw2_8h: 410; CHECK-SD: // %bb.0: 411; CHECK-SD-NEXT: ldr q0, [x0] 412; CHECK-SD-NEXT: ldr d1, [x1, #8] 413; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b 414; CHECK-SD-NEXT: ret 415; 416; CHECK-GI-LABEL: uaddw2_8h: 417; CHECK-GI: // %bb.0: 418; CHECK-GI-NEXT: ldr q0, [x0] 419; CHECK-GI-NEXT: ldr q1, [x1] 420; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b 421; CHECK-GI-NEXT: ret 422 %tmp1 = load <8 x i16>, ptr %A 423 424 %tmp2 = load <16 x i8>, ptr %B 425 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 426 %ext2 = zext <8 x i8> %high2 to <8 x i16> 427 428 %res = add <8 x i16> %tmp1, %ext2 429 ret <8 x i16> %res 430} 431 432define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind { 433; CHECK-SD-LABEL: uaddw2_4s: 434; CHECK-SD: // %bb.0: 435; CHECK-SD-NEXT: ldr q0, [x0] 436; CHECK-SD-NEXT: ldr d1, [x1, #8] 437; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h 438; CHECK-SD-NEXT: ret 439; 440; CHECK-GI-LABEL: uaddw2_4s: 441; CHECK-GI: // %bb.0: 442; CHECK-GI-NEXT: ldr q0, [x0] 443; CHECK-GI-NEXT: ldr q1, [x1] 444; CHECK-GI-NEXT: uaddw2 v0.4s, v0.4s, v1.8h 445; CHECK-GI-NEXT: ret 446 %tmp1 = load <4 x i32>, ptr %A 447 448 %tmp2 = load <8 x i16>, ptr %B 449 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 450 %ext2 = zext <4 x i16> %high2 to <4 x i32> 451 452 %res = add <4 x i32> %tmp1, %ext2 453 ret <4 x i32> %res 454} 455 456define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind { 457; CHECK-SD-LABEL: uaddw2_2d: 458; CHECK-SD: // %bb.0: 459; CHECK-SD-NEXT: ldr q0, [x0] 460; CHECK-SD-NEXT: ldr d1, [x1, #8] 461; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s 462; CHECK-SD-NEXT: ret 463; 464; CHECK-GI-LABEL: uaddw2_2d: 465; CHECK-GI: // %bb.0: 466; CHECK-GI-NEXT: 
ldr q0, [x0] 467; CHECK-GI-NEXT: ldr q1, [x1] 468; CHECK-GI-NEXT: uaddw2 v0.2d, v0.2d, v1.4s 469; CHECK-GI-NEXT: ret 470 %tmp1 = load <2 x i64>, ptr %A 471 472 %tmp2 = load <4 x i32>, ptr %B 473 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 474 %ext2 = zext <2 x i32> %high2 to <2 x i64> 475 476 %res = add <2 x i64> %tmp1, %ext2 477 ret <2 x i64> %res 478} 479 480define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind { 481; CHECK-LABEL: saddw8h: 482; CHECK: // %bb.0: 483; CHECK-NEXT: ldr q0, [x0] 484; CHECK-NEXT: ldr d1, [x1] 485; CHECK-NEXT: saddw v0.8h, v0.8h, v1.8b 486; CHECK-NEXT: ret 487 %tmp1 = load <8 x i16>, ptr %A 488 %tmp2 = load <8 x i8>, ptr %B 489 %tmp3 = sext <8 x i8> %tmp2 to <8 x i16> 490 %tmp4 = add <8 x i16> %tmp1, %tmp3 491 ret <8 x i16> %tmp4 492} 493 494define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind { 495; CHECK-LABEL: saddw4s: 496; CHECK: // %bb.0: 497; CHECK-NEXT: ldr q0, [x0] 498; CHECK-NEXT: ldr d1, [x1] 499; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h 500; CHECK-NEXT: ret 501 %tmp1 = load <4 x i32>, ptr %A 502 %tmp2 = load <4 x i16>, ptr %B 503 %tmp3 = sext <4 x i16> %tmp2 to <4 x i32> 504 %tmp4 = add <4 x i32> %tmp1, %tmp3 505 ret <4 x i32> %tmp4 506} 507 508define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind { 509; CHECK-LABEL: saddw2d: 510; CHECK: // %bb.0: 511; CHECK-NEXT: ldr q0, [x0] 512; CHECK-NEXT: ldr d1, [x1] 513; CHECK-NEXT: saddw v0.2d, v0.2d, v1.2s 514; CHECK-NEXT: ret 515 %tmp1 = load <2 x i64>, ptr %A 516 %tmp2 = load <2 x i32>, ptr %B 517 %tmp3 = sext <2 x i32> %tmp2 to <2 x i64> 518 %tmp4 = add <2 x i64> %tmp1, %tmp3 519 ret <2 x i64> %tmp4 520} 521 522define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind { 523; CHECK-SD-LABEL: saddw2_8h: 524; CHECK-SD: // %bb.0: 525; CHECK-SD-NEXT: ldr q0, [x0] 526; CHECK-SD-NEXT: ldr d1, [x1, #8] 527; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v1.8b 528; CHECK-SD-NEXT: ret 529; 530; CHECK-GI-LABEL: saddw2_8h: 531; CHECK-GI: // %bb.0: 532; CHECK-GI-NEXT: ldr q0, [x0] 
533; CHECK-GI-NEXT: ldr q1, [x1] 534; CHECK-GI-NEXT: saddw2 v0.8h, v0.8h, v1.16b 535; CHECK-GI-NEXT: ret 536 %tmp1 = load <8 x i16>, ptr %A 537 538 %tmp2 = load <16 x i8>, ptr %B 539 %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 540 %ext2 = sext <8 x i8> %high2 to <8 x i16> 541 542 %res = add <8 x i16> %tmp1, %ext2 543 ret <8 x i16> %res 544} 545 546define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind { 547; CHECK-SD-LABEL: saddw2_4s: 548; CHECK-SD: // %bb.0: 549; CHECK-SD-NEXT: ldr q0, [x0] 550; CHECK-SD-NEXT: ldr d1, [x1, #8] 551; CHECK-SD-NEXT: saddw v0.4s, v0.4s, v1.4h 552; CHECK-SD-NEXT: ret 553; 554; CHECK-GI-LABEL: saddw2_4s: 555; CHECK-GI: // %bb.0: 556; CHECK-GI-NEXT: ldr q0, [x0] 557; CHECK-GI-NEXT: ldr q1, [x1] 558; CHECK-GI-NEXT: saddw2 v0.4s, v0.4s, v1.8h 559; CHECK-GI-NEXT: ret 560 %tmp1 = load <4 x i32>, ptr %A 561 562 %tmp2 = load <8 x i16>, ptr %B 563 %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 564 %ext2 = sext <4 x i16> %high2 to <4 x i32> 565 566 %res = add <4 x i32> %tmp1, %ext2 567 ret <4 x i32> %res 568} 569 570define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind { 571; CHECK-SD-LABEL: saddw2_2d: 572; CHECK-SD: // %bb.0: 573; CHECK-SD-NEXT: ldr q0, [x0] 574; CHECK-SD-NEXT: ldr d1, [x1, #8] 575; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v1.2s 576; CHECK-SD-NEXT: ret 577; 578; CHECK-GI-LABEL: saddw2_2d: 579; CHECK-GI: // %bb.0: 580; CHECK-GI-NEXT: ldr q0, [x0] 581; CHECK-GI-NEXT: ldr q1, [x1] 582; CHECK-GI-NEXT: saddw2 v0.2d, v0.2d, v1.4s 583; CHECK-GI-NEXT: ret 584 %tmp1 = load <2 x i64>, ptr %A 585 586 %tmp2 = load <4 x i32>, ptr %B 587 %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 588 %ext2 = sext <2 x i32> %high2 to <2 x i64> 589 590 %res = add <2 x i64> %tmp1, %ext2 591 ret <2 x i64> %res 592} 593 594define <4 x i16> @saddlp4h(ptr %A) nounwind { 595; CHECK-LABEL: saddlp4h: 596; 
CHECK: // %bb.0: 597; CHECK-NEXT: ldr d0, [x0] 598; CHECK-NEXT: saddlp v0.4h, v0.8b 599; CHECK-NEXT: ret 600 %tmp1 = load <8 x i8>, ptr %A 601 %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) 602 ret <4 x i16> %tmp3 603} 604 605define <2 x i32> @saddlp2s(ptr %A) nounwind { 606; CHECK-LABEL: saddlp2s: 607; CHECK: // %bb.0: 608; CHECK-NEXT: ldr d0, [x0] 609; CHECK-NEXT: saddlp v0.2s, v0.4h 610; CHECK-NEXT: ret 611 %tmp1 = load <4 x i16>, ptr %A 612 %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) 613 ret <2 x i32> %tmp3 614} 615 616define <1 x i64> @saddlp1d(ptr %A) nounwind { 617; CHECK-LABEL: saddlp1d: 618; CHECK: // %bb.0: 619; CHECK-NEXT: ldr d0, [x0] 620; CHECK-NEXT: saddlp v0.1d, v0.2s 621; CHECK-NEXT: ret 622 %tmp1 = load <2 x i32>, ptr %A 623 %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1) 624 ret <1 x i64> %tmp3 625} 626 627define <8 x i16> @saddlp8h(ptr %A) nounwind { 628; CHECK-LABEL: saddlp8h: 629; CHECK: // %bb.0: 630; CHECK-NEXT: ldr q0, [x0] 631; CHECK-NEXT: saddlp v0.8h, v0.16b 632; CHECK-NEXT: ret 633 %tmp1 = load <16 x i8>, ptr %A 634 %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) 635 ret <8 x i16> %tmp3 636} 637 638define <4 x i32> @saddlp4s(ptr %A) nounwind { 639; CHECK-LABEL: saddlp4s: 640; CHECK: // %bb.0: 641; CHECK-NEXT: ldr q0, [x0] 642; CHECK-NEXT: saddlp v0.4s, v0.8h 643; CHECK-NEXT: ret 644 %tmp1 = load <8 x i16>, ptr %A 645 %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) 646 ret <4 x i32> %tmp3 647} 648 649define <2 x i64> @saddlp2d(ptr %A) nounwind { 650; CHECK-LABEL: saddlp2d: 651; CHECK: // %bb.0: 652; CHECK-NEXT: ldr q0, [x0] 653; CHECK-NEXT: saddlp v0.2d, v0.4s 654; CHECK-NEXT: ret 655 %tmp1 = load <4 x i32>, ptr %A 656 %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) 657 ret <2 x i64> %tmp3 658} 659 660declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 
x i8>) nounwind readnone 661declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone 662declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone 663 664declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone 665declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone 666declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone 667 668define <4 x i16> @uaddlp4h(ptr %A) nounwind { 669; CHECK-LABEL: uaddlp4h: 670; CHECK: // %bb.0: 671; CHECK-NEXT: ldr d0, [x0] 672; CHECK-NEXT: uaddlp v0.4h, v0.8b 673; CHECK-NEXT: ret 674 %tmp1 = load <8 x i8>, ptr %A 675 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) 676 ret <4 x i16> %tmp3 677} 678 679define <2 x i32> @uaddlp2s(ptr %A) nounwind { 680; CHECK-LABEL: uaddlp2s: 681; CHECK: // %bb.0: 682; CHECK-NEXT: ldr d0, [x0] 683; CHECK-NEXT: uaddlp v0.2s, v0.4h 684; CHECK-NEXT: ret 685 %tmp1 = load <4 x i16>, ptr %A 686 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) 687 ret <2 x i32> %tmp3 688} 689 690define <1 x i64> @uaddlp1d(ptr %A) nounwind { 691; CHECK-LABEL: uaddlp1d: 692; CHECK: // %bb.0: 693; CHECK-NEXT: ldr d0, [x0] 694; CHECK-NEXT: uaddlp v0.1d, v0.2s 695; CHECK-NEXT: ret 696 %tmp1 = load <2 x i32>, ptr %A 697 %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1) 698 ret <1 x i64> %tmp3 699} 700 701define <8 x i16> @uaddlp8h(ptr %A) nounwind { 702; CHECK-LABEL: uaddlp8h: 703; CHECK: // %bb.0: 704; CHECK-NEXT: ldr q0, [x0] 705; CHECK-NEXT: uaddlp v0.8h, v0.16b 706; CHECK-NEXT: ret 707 %tmp1 = load <16 x i8>, ptr %A 708 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) 709 ret <8 x i16> %tmp3 710} 711 712define <4 x i32> @uaddlp4s(ptr %A) nounwind { 713; CHECK-LABEL: uaddlp4s: 714; CHECK: // %bb.0: 715; CHECK-NEXT: ldr q0, [x0] 716; CHECK-NEXT: uaddlp v0.4s, v0.8h 717; 
CHECK-NEXT: ret 718 %tmp1 = load <8 x i16>, ptr %A 719 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) 720 ret <4 x i32> %tmp3 721} 722 723define <2 x i64> @uaddlp2d(ptr %A) nounwind { 724; CHECK-LABEL: uaddlp2d: 725; CHECK: // %bb.0: 726; CHECK-NEXT: ldr q0, [x0] 727; CHECK-NEXT: uaddlp v0.2d, v0.4s 728; CHECK-NEXT: ret 729 %tmp1 = load <4 x i32>, ptr %A 730 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) 731 ret <2 x i64> %tmp3 732} 733 734declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone 735declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone 736declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone 737 738declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone 739declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone 740declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone 741 742define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind { 743; CHECK-LABEL: sadalp4h: 744; CHECK: // %bb.0: 745; CHECK-NEXT: ldr d1, [x0] 746; CHECK-NEXT: ldr d0, [x1] 747; CHECK-NEXT: sadalp v0.4h, v1.8b 748; CHECK-NEXT: ret 749 %tmp1 = load <8 x i8>, ptr %A 750 %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) 751 %tmp4 = load <4 x i16>, ptr %B 752 %tmp5 = add <4 x i16> %tmp3, %tmp4 753 ret <4 x i16> %tmp5 754} 755 756define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind { 757; CHECK-LABEL: sadalp2s: 758; CHECK: // %bb.0: 759; CHECK-NEXT: ldr d1, [x0] 760; CHECK-NEXT: ldr d0, [x1] 761; CHECK-NEXT: sadalp v0.2s, v1.4h 762; CHECK-NEXT: ret 763 %tmp1 = load <4 x i16>, ptr %A 764 %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) 765 %tmp4 = load <2 x i32>, ptr %B 766 %tmp5 = add <2 x i32> %tmp3, %tmp4 767 ret <2 x i32> %tmp5 768} 769 770define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind { 771; 
CHECK-LABEL: sadalp8h: 772; CHECK: // %bb.0: 773; CHECK-NEXT: ldr q1, [x0] 774; CHECK-NEXT: ldr q0, [x1] 775; CHECK-NEXT: sadalp v0.8h, v1.16b 776; CHECK-NEXT: ret 777 %tmp1 = load <16 x i8>, ptr %A 778 %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) 779 %tmp4 = load <8 x i16>, ptr %B 780 %tmp5 = add <8 x i16> %tmp3, %tmp4 781 ret <8 x i16> %tmp5 782} 783 784define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind { 785; CHECK-LABEL: sadalp4s: 786; CHECK: // %bb.0: 787; CHECK-NEXT: ldr q1, [x0] 788; CHECK-NEXT: ldr q0, [x1] 789; CHECK-NEXT: sadalp v0.4s, v1.8h 790; CHECK-NEXT: ret 791 %tmp1 = load <8 x i16>, ptr %A 792 %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) 793 %tmp4 = load <4 x i32>, ptr %B 794 %tmp5 = add <4 x i32> %tmp3, %tmp4 795 ret <4 x i32> %tmp5 796} 797 798define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind { 799; CHECK-LABEL: sadalp2d: 800; CHECK: // %bb.0: 801; CHECK-NEXT: ldr q1, [x0] 802; CHECK-NEXT: ldr q0, [x1] 803; CHECK-NEXT: sadalp v0.2d, v1.4s 804; CHECK-NEXT: ret 805 %tmp1 = load <4 x i32>, ptr %A 806 %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) 807 %tmp4 = load <2 x i64>, ptr %B 808 %tmp5 = add <2 x i64> %tmp3, %tmp4 809 ret <2 x i64> %tmp5 810} 811 812define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind { 813; CHECK-LABEL: uadalp4h: 814; CHECK: // %bb.0: 815; CHECK-NEXT: ldr d1, [x0] 816; CHECK-NEXT: ldr d0, [x1] 817; CHECK-NEXT: uadalp v0.4h, v1.8b 818; CHECK-NEXT: ret 819 %tmp1 = load <8 x i8>, ptr %A 820 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) 821 %tmp4 = load <4 x i16>, ptr %B 822 %tmp5 = add <4 x i16> %tmp3, %tmp4 823 ret <4 x i16> %tmp5 824} 825 826define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind { 827; CHECK-LABEL: uadalp2s: 828; CHECK: // %bb.0: 829; CHECK-NEXT: ldr d1, [x0] 830; CHECK-NEXT: ldr d0, [x1] 831; CHECK-NEXT: uadalp v0.2s, v1.4h 832; CHECK-NEXT: ret 833 %tmp1 = load <4 x i16>, ptr %A 834 
%tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) 835 %tmp4 = load <2 x i32>, ptr %B 836 %tmp5 = add <2 x i32> %tmp3, %tmp4 837 ret <2 x i32> %tmp5 838} 839 840define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind { 841; CHECK-LABEL: uadalp8h: 842; CHECK: // %bb.0: 843; CHECK-NEXT: ldr q1, [x0] 844; CHECK-NEXT: ldr q0, [x1] 845; CHECK-NEXT: uadalp v0.8h, v1.16b 846; CHECK-NEXT: ret 847 %tmp1 = load <16 x i8>, ptr %A 848 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) 849 %tmp4 = load <8 x i16>, ptr %B 850 %tmp5 = add <8 x i16> %tmp3, %tmp4 851 ret <8 x i16> %tmp5 852} 853 854define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind { 855; CHECK-LABEL: uadalp4s: 856; CHECK: // %bb.0: 857; CHECK-NEXT: ldr q1, [x0] 858; CHECK-NEXT: ldr q0, [x1] 859; CHECK-NEXT: uadalp v0.4s, v1.8h 860; CHECK-NEXT: ret 861 %tmp1 = load <8 x i16>, ptr %A 862 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) 863 %tmp4 = load <4 x i32>, ptr %B 864 %tmp5 = add <4 x i32> %tmp3, %tmp4 865 ret <4 x i32> %tmp5 866} 867 868define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind { 869; CHECK-LABEL: uadalp2d: 870; CHECK: // %bb.0: 871; CHECK-NEXT: ldr q1, [x0] 872; CHECK-NEXT: ldr q0, [x1] 873; CHECK-NEXT: uadalp v0.2d, v1.4s 874; CHECK-NEXT: ret 875 %tmp1 = load <4 x i32>, ptr %A 876 %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) 877 %tmp4 = load <2 x i64>, ptr %B 878 %tmp5 = add <2 x i64> %tmp3, %tmp4 879 ret <2 x i64> %tmp5 880} 881 882define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind { 883; CHECK-LABEL: addp_8b: 884; CHECK: // %bb.0: 885; CHECK-NEXT: ldr d0, [x0] 886; CHECK-NEXT: ldr d1, [x1] 887; CHECK-NEXT: addp v0.8b, v0.8b, v1.8b 888; CHECK-NEXT: ret 889 %tmp1 = load <8 x i8>, ptr %A 890 %tmp2 = load <8 x i8>, ptr %B 891 %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) 892 ret <8 x i8> %tmp3 893} 894 895define <16 x i8> @addp_16b(ptr %A, ptr %B) 
nounwind { 896; CHECK-LABEL: addp_16b: 897; CHECK: // %bb.0: 898; CHECK-NEXT: ldr q0, [x0] 899; CHECK-NEXT: ldr q1, [x1] 900; CHECK-NEXT: addp v0.16b, v0.16b, v1.16b 901; CHECK-NEXT: ret 902 %tmp1 = load <16 x i8>, ptr %A 903 %tmp2 = load <16 x i8>, ptr %B 904 %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) 905 ret <16 x i8> %tmp3 906} 907 908define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind { 909; CHECK-LABEL: addp_4h: 910; CHECK: // %bb.0: 911; CHECK-NEXT: ldr d0, [x0] 912; CHECK-NEXT: ldr d1, [x1] 913; CHECK-NEXT: addp v0.4h, v0.4h, v1.4h 914; CHECK-NEXT: ret 915 %tmp1 = load <4 x i16>, ptr %A 916 %tmp2 = load <4 x i16>, ptr %B 917 %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 918 ret <4 x i16> %tmp3 919} 920 921define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind { 922; CHECK-LABEL: addp_8h: 923; CHECK: // %bb.0: 924; CHECK-NEXT: ldr q0, [x0] 925; CHECK-NEXT: ldr q1, [x1] 926; CHECK-NEXT: addp v0.8h, v0.8h, v1.8h 927; CHECK-NEXT: ret 928 %tmp1 = load <8 x i16>, ptr %A 929 %tmp2 = load <8 x i16>, ptr %B 930 %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 931 ret <8 x i16> %tmp3 932} 933 934define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind { 935; CHECK-LABEL: addp_2s: 936; CHECK: // %bb.0: 937; CHECK-NEXT: ldr d0, [x0] 938; CHECK-NEXT: ldr d1, [x1] 939; CHECK-NEXT: addp v0.2s, v0.2s, v1.2s 940; CHECK-NEXT: ret 941 %tmp1 = load <2 x i32>, ptr %A 942 %tmp2 = load <2 x i32>, ptr %B 943 %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 944 ret <2 x i32> %tmp3 945} 946 947define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind { 948; CHECK-LABEL: addp_4s: 949; CHECK: // %bb.0: 950; CHECK-NEXT: ldr q0, [x0] 951; CHECK-NEXT: ldr q1, [x1] 952; CHECK-NEXT: addp v0.4s, v0.4s, v1.4s 953; CHECK-NEXT: ret 954 %tmp1 = load <4 x i32>, ptr %A 955 %tmp2 = load <4 x i32>, ptr %B 956 %tmp3 = call <4 x i32> 
@llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 957 ret <4 x i32> %tmp3 958} 959 ; addp on <2 x i64>. 960define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind { 961; CHECK-LABEL: addp_2d: 962; CHECK: // %bb.0: 963; CHECK-NEXT: ldr q0, [x0] 964; CHECK-NEXT: ldr q1, [x1] 965; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d 966; CHECK-NEXT: ret 967 %tmp1 = load <2 x i64>, ptr %A 968 %tmp2 = load <2 x i64>, ptr %B 969 %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) 970 ret <2 x i64> %tmp3 971} 972 ; Declarations of the addp intrinsics called by the addp_* tests. 973declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone 974declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 975declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 976declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 977declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 978declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 979declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone 980 ; faddp_*: the llvm.aarch64.neon.faddp intrinsic on <2 x float>, <4 x float> and <2 x double>; each case expects two loads followed by one faddp. 981define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind { 982; CHECK-LABEL: faddp_2s: 983; CHECK: // %bb.0: 984; CHECK-NEXT: ldr d0, [x0] 985; CHECK-NEXT: ldr d1, [x1] 986; CHECK-NEXT: faddp v0.2s, v0.2s, v1.2s 987; CHECK-NEXT: ret 988 %tmp1 = load <2 x float>, ptr %A 989 %tmp2 = load <2 x float>, ptr %B 990 %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) 991 ret <2 x float> %tmp3 992} 993 994define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind { 995; CHECK-LABEL: faddp_4s: 996; CHECK: // %bb.0: 997; CHECK-NEXT: ldr q0, [x0] 998; CHECK-NEXT: ldr q1, [x1] 999; CHECK-NEXT: faddp v0.4s, v0.4s, v1.4s 1000; CHECK-NEXT: ret 1001 %tmp1 = load <4 x float>, ptr %A 1002 %tmp2 = load <4 x float>, ptr %B 1003 %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float>
%tmp2) 1004 ret <4 x float> %tmp3 1005} 1006 ; faddp on <2 x double>. 1007define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind { 1008; CHECK-LABEL: faddp_2d: 1009; CHECK: // %bb.0: 1010; CHECK-NEXT: ldr q0, [x0] 1011; CHECK-NEXT: ldr q1, [x1] 1012; CHECK-NEXT: faddp v0.2d, v0.2d, v1.2d 1013; CHECK-NEXT: ret 1014 %tmp1 = load <2 x double>, ptr %A 1015 %tmp2 = load <2 x double>, ptr %B 1016 %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) 1017 ret <2 x double> %tmp3 1018} 1019 ; Declarations of the faddp intrinsics called by the faddp_* tests. 1020declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone 1021declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone 1022declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone 1023 ; *_duprhs / *_duplhs: widening add/sub in which one operand is a scalar inserted into both lanes of a <2 x i32>. The tests without "2" in the name take lanes 0-1 of the vector argument and expect one dup plus the long instruction from both instruction selectors. 1024define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) { 1025; CHECK-LABEL: uaddl_duprhs: 1026; CHECK: // %bb.0: 1027; CHECK-NEXT: dup v1.2s, w0 1028; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s 1029; CHECK-NEXT: ret 1030 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1031 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1032 1033 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1034 1035 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64> 1036 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64> 1037 1038 %res = add <2 x i64> %lhs.ext, %rhs.ext 1039 ret <2 x i64> %res 1040} 1041 ; Lanes 2-3 variant: SelectionDAG is expected to emit dup + uaddl2, GlobalISel dup + ushll + uaddw2. 1042define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) { 1043; CHECK-SD-LABEL: uaddl2_duprhs: 1044; CHECK-SD: // %bb.0: 1045; CHECK-SD-NEXT: dup v1.4s, w0 1046; CHECK-SD-NEXT: uaddl2 v0.2d, v0.4s, v1.4s 1047; CHECK-SD-NEXT: ret 1048; 1049; CHECK-GI-LABEL: uaddl2_duprhs: 1050; CHECK-GI: // %bb.0: 1051; CHECK-GI-NEXT: dup v1.2s, w0 1052; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 1053; CHECK-GI-NEXT: uaddw2 v0.2d, v1.2d, v0.4s 1054; CHECK-GI-NEXT: ret 1055 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1056 %rhsvec =
insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1057 1058 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1059 1060 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64> 1061 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64> 1062 1063 %res = add <2 x i64> %lhs.ext, %rhs.ext 1064 ret <2 x i64> %res 1065} 1066 ; Signed widening add with the scalar splat as the left operand and lanes 0-1 of %rhs; expects dup + saddl. 1067define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) { 1068; CHECK-LABEL: saddl_duplhs: 1069; CHECK: // %bb.0: 1070; CHECK-NEXT: dup v1.2s, w0 1071; CHECK-NEXT: saddl v0.2d, v1.2s, v0.2s 1072; CHECK-NEXT: ret 1073 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 1074 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 1075 1076 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1077 1078 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64> 1079 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64> 1080 1081 %res = add <2 x i64> %lhs.ext, %rhs.ext 1082 ret <2 x i64> %res 1083} 1084 ; Lanes 2-3 variant: SelectionDAG is expected to emit dup + saddl2, GlobalISel dup + sshll + saddw2. 1085define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) { 1086; CHECK-SD-LABEL: saddl2_duplhs: 1087; CHECK-SD: // %bb.0: 1088; CHECK-SD-NEXT: dup v1.4s, w0 1089; CHECK-SD-NEXT: saddl2 v0.2d, v1.4s, v0.4s 1090; CHECK-SD-NEXT: ret 1091; 1092; CHECK-GI-LABEL: saddl2_duplhs: 1093; CHECK-GI: // %bb.0: 1094; CHECK-GI-NEXT: dup v1.2s, w0 1095; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 1096; CHECK-GI-NEXT: saddw2 v0.2d, v1.2d, v0.4s 1097; CHECK-GI-NEXT: ret 1098 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 1099 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 1100 1101 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1102 1103 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64> 1104 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64> 1105 1106 %res = add <2 x i64> %lhs.ext, %rhs.ext 1107 ret <2 x i64> %res 1108} 1109 ; Unsigned widening subtract with the scalar splat as the right operand and lanes 0-1 of %lhs; expects dup + usubl. 1110define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) { 1111; CHECK-LABEL: usubl_duprhs: 1112; CHECK: // %bb.0: 1113;
CHECK-NEXT: dup v1.2s, w0 1114; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s 1115; CHECK-NEXT: ret 1116 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1117 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1118 1119 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1120 1121 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64> 1122 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64> 1123 1124 %res = sub <2 x i64> %lhs.ext, %rhs.ext 1125 ret <2 x i64> %res 1126} 1127 ; Lanes 2-3 variant: SelectionDAG is expected to emit dup + usubl2; GlobalISel is expected to emit dup, a mov of the upper d lane, then usubl. 1128define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) { 1129; CHECK-SD-LABEL: usubl2_duprhs: 1130; CHECK-SD: // %bb.0: 1131; CHECK-SD-NEXT: dup v1.4s, w0 1132; CHECK-SD-NEXT: usubl2 v0.2d, v0.4s, v1.4s 1133; CHECK-SD-NEXT: ret 1134; 1135; CHECK-GI-LABEL: usubl2_duprhs: 1136; CHECK-GI: // %bb.0: 1137; CHECK-GI-NEXT: dup v1.2s, w0 1138; CHECK-GI-NEXT: mov d0, v0.d[1] 1139; CHECK-GI-NEXT: usubl v0.2d, v0.2s, v1.2s 1140; CHECK-GI-NEXT: ret 1141 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1142 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1143 1144 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1145 1146 %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64> 1147 %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64> 1148 1149 %res = sub <2 x i64> %lhs.ext, %rhs.ext 1150 ret <2 x i64> %res 1151} 1152 ; Signed widening subtract with the scalar splat as the left operand and lanes 0-1 of %rhs; expects dup + ssubl. 1153define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) { 1154; CHECK-LABEL: ssubl_duplhs: 1155; CHECK: // %bb.0: 1156; CHECK-NEXT: dup v1.2s, w0 1157; CHECK-NEXT: ssubl v0.2d, v1.2s, v0.2s 1158; CHECK-NEXT: ret 1159 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 1160 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 1161 1162 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1163 1164 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64> 1165 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64> 1166 1167 %res = sub <2 x i64> %lhs.ext, %rhs.ext 1168
ret <2 x i64> %res 1169} 1170 ; Lanes 2-3 variant: SelectionDAG is expected to emit dup + ssubl2, GlobalISel dup + sshll + ssubw2. 1171define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) { 1172; CHECK-SD-LABEL: ssubl2_duplhs: 1173; CHECK-SD: // %bb.0: 1174; CHECK-SD-NEXT: dup v1.4s, w0 1175; CHECK-SD-NEXT: ssubl2 v0.2d, v1.4s, v0.4s 1176; CHECK-SD-NEXT: ret 1177; 1178; CHECK-GI-LABEL: ssubl2_duplhs: 1179; CHECK-GI: // %bb.0: 1180; CHECK-GI-NEXT: dup v1.2s, w0 1181; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 1182; CHECK-GI-NEXT: ssubw2 v0.2d, v1.2d, v0.4s 1183; CHECK-GI-NEXT: ret 1184 %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 1185 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 1186 1187 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1188 1189 %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64> 1190 %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64> 1191 1192 %res = sub <2 x i64> %lhs.ext, %rhs.ext 1193 ret <2 x i64> %res 1194} 1195 ; *_natural: addhn written as generic IR (add, lshr by half the element width, trunc) instead of the intrinsic. SelectionDAG is expected to fold this to addhn; GlobalISel is expected to emit add + shrn. 1196define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind { 1197; CHECK-SD-LABEL: addhn8b_natural: 1198; CHECK-SD: // %bb.0: 1199; CHECK-SD-NEXT: ldr q0, [x0] 1200; CHECK-SD-NEXT: ldr q1, [x1] 1201; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h 1202; CHECK-SD-NEXT: ret 1203; 1204; CHECK-GI-LABEL: addhn8b_natural: 1205; CHECK-GI: // %bb.0: 1206; CHECK-GI-NEXT: ldr q0, [x0] 1207; CHECK-GI-NEXT: ldr q1, [x1] 1208; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h 1209; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 1210; CHECK-GI-NEXT: ret 1211 %tmp1 = load <8 x i16>, ptr %A 1212 %tmp2 = load <8 x i16>, ptr %B 1213 %sum = add <8 x i16> %tmp1, %tmp2 1214 %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1215 %narrowed = trunc <8 x i16> %high_bits to <8 x i8> 1216 ret <8 x i8> %narrowed 1217} 1218 1219define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind { 1220; CHECK-SD-LABEL: addhn4h_natural: 1221; CHECK-SD: // %bb.0: 1222; CHECK-SD-NEXT: ldr q0, [x0] 1223; CHECK-SD-NEXT: ldr q1, [x1] 1224; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s 1225; CHECK-SD-NEXT:
ret 1226; 1227; CHECK-GI-LABEL: addhn4h_natural: 1228; CHECK-GI: // %bb.0: 1229; CHECK-GI-NEXT: ldr q0, [x0] 1230; CHECK-GI-NEXT: ldr q1, [x1] 1231; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s 1232; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 1233; CHECK-GI-NEXT: ret 1234 %tmp1 = load <4 x i32>, ptr %A 1235 %tmp2 = load <4 x i32>, ptr %B 1236 %sum = add <4 x i32> %tmp1, %tmp2 1237 %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> 1238 %narrowed = trunc <4 x i32> %high_bits to <4 x i16> 1239 ret <4 x i16> %narrowed 1240} 1241 ; <2 x i64> to <2 x i32> case of the add/lshr/trunc pattern (shift amount 32). 1242define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind { 1243; CHECK-SD-LABEL: addhn2s_natural: 1244; CHECK-SD: // %bb.0: 1245; CHECK-SD-NEXT: ldr q0, [x0] 1246; CHECK-SD-NEXT: ldr q1, [x1] 1247; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d 1248; CHECK-SD-NEXT: ret 1249; 1250; CHECK-GI-LABEL: addhn2s_natural: 1251; CHECK-GI: // %bb.0: 1252; CHECK-GI-NEXT: ldr q0, [x0] 1253; CHECK-GI-NEXT: ldr q1, [x1] 1254; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d 1255; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 1256; CHECK-GI-NEXT: ret 1257 %tmp1 = load <2 x i64>, ptr %A 1258 %tmp2 = load <2 x i64>, ptr %B 1259 %sum = add <2 x i64> %tmp1, %tmp2 1260 %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32> 1261 %narrowed = trunc <2 x i64> %high_bits to <2 x i32> 1262 ret <2 x i32> %narrowed 1263} 1264 ; addhn2_*_natural: the narrowed sum is concatenated after %low with a shufflevector. SelectionDAG is expected to emit addhn2; GlobalISel is expected to emit add + shrn2. 1265define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind { 1266; CHECK-SD-LABEL: addhn2_16b_natural: 1267; CHECK-SD: // %bb.0: 1268; CHECK-SD-NEXT: ldr q1, [x0] 1269; CHECK-SD-NEXT: ldr q2, [x1] 1270; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1271; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h 1272; CHECK-SD-NEXT: ret 1273; 1274; CHECK-GI-LABEL: addhn2_16b_natural: 1275; CHECK-GI: // %bb.0: 1276; CHECK-GI-NEXT: ldr q1, [x0] 1277; CHECK-GI-NEXT: ldr q2, [x1] 1278; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 1279; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h 1280; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8 1281; CHECK-GI-NEXT: ret 1282 %tmp1
= load <8 x i16>, ptr %A 1283 %tmp2 = load <8 x i16>, ptr %B 1284 %sum = add <8 x i16> %tmp1, %tmp2 1285 %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1286 %narrowed = trunc <8 x i16> %high_bits to <8 x i8> 1287 %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1288 ret <16 x i8> %res 1289} 1290 ; <4 x i32> to <4 x i16> case, placed after %low in the result. 1291define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind { 1292; CHECK-SD-LABEL: addhn2_8h_natural: 1293; CHECK-SD: // %bb.0: 1294; CHECK-SD-NEXT: ldr q1, [x0] 1295; CHECK-SD-NEXT: ldr q2, [x1] 1296; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1297; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s 1298; CHECK-SD-NEXT: ret 1299; 1300; CHECK-GI-LABEL: addhn2_8h_natural: 1301; CHECK-GI: // %bb.0: 1302; CHECK-GI-NEXT: ldr q1, [x0] 1303; CHECK-GI-NEXT: ldr q2, [x1] 1304; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 1305; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s 1306; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16 1307; CHECK-GI-NEXT: ret 1308 %tmp1 = load <4 x i32>, ptr %A 1309 %tmp2 = load <4 x i32>, ptr %B 1310 %sum = add <4 x i32> %tmp1, %tmp2 1311 %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16> 1312 %narrowed = trunc <4 x i32> %high_bits to <4 x i16> 1313 %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1314 ret <8 x i16> %res 1315} 1316 ; <2 x i64> to <2 x i32> case, placed after %low in the result. 1317define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind { 1318; CHECK-SD-LABEL: addhn2_4s_natural: 1319; CHECK-SD: // %bb.0: 1320; CHECK-SD-NEXT: ldr q1, [x0] 1321; CHECK-SD-NEXT: ldr q2, [x1] 1322; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1323; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d 1324; CHECK-SD-NEXT: ret 1325; 1326; CHECK-GI-LABEL: addhn2_4s_natural: 1327; CHECK-GI: // %bb.0: 1328; CHECK-GI-NEXT: ldr q1, [x0]
1329; CHECK-GI-NEXT: ldr q2, [x1] 1330; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 1331; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d 1332; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32 1333; CHECK-GI-NEXT: ret 1334 %tmp1 = load <2 x i64>, ptr %A 1335 %tmp2 = load <2 x i64>, ptr %B 1336 %sum = add <2 x i64> %tmp1, %tmp2 1337 %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32> 1338 %narrowed = trunc <2 x i64> %high_bits to <2 x i32> 1339 %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1340 ret <4 x i32> %res 1341} 1342 ; Both halves of the result are narrowed sums. NOTE(review): %high_bits below is computed from %sum1, not %sum2, so %sum2 and the loads of %C and %D are unused; the expected output accordingly loads only from x0/x1 and feeds the same operands to both halves. If %sum2 was intended, fix the IR and regenerate the assertions with utils/update_llc_test_checks.py rather than editing them by hand. 1343define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind { 1344; CHECK-SD-LABEL: addhn_addhn2_4s: 1345; CHECK-SD: // %bb.0: 1346; CHECK-SD-NEXT: ldr q1, [x0] 1347; CHECK-SD-NEXT: ldr q2, [x1] 1348; CHECK-SD-NEXT: addhn v0.2s, v1.2d, v2.2d 1349; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d 1350; CHECK-SD-NEXT: ret 1351; 1352; CHECK-GI-LABEL: addhn_addhn2_4s: 1353; CHECK-GI: // %bb.0: 1354; CHECK-GI-NEXT: ldr q0, [x0] 1355; CHECK-GI-NEXT: ldr q1, [x1] 1356; CHECK-GI-NEXT: add v1.2d, v0.2d, v1.2d 1357; CHECK-GI-NEXT: shrn v0.2s, v1.2d, #32 1358; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32 1359; CHECK-GI-NEXT: ret 1360 %tmp1 = load <2 x i64>, ptr %A 1361 %tmp2 = load <2 x i64>, ptr %B 1362 %sum1 = add <2 x i64> %tmp1, %tmp2 1363 %low_bits = lshr <2 x i64> %sum1, <i64 32, i64 32> 1364 %narrowed1 = trunc <2 x i64> %low_bits to <2 x i32> 1365 %tmp3 = load <2 x i64>, ptr %C 1366 %tmp4 = load <2 x i64>, ptr %D 1367 %sum2 = add <2 x i64> %tmp3, %tmp4 1368 %high_bits = lshr <2 x i64> %sum1, <i64 32, i64 32> ; NOTE(review): reads %sum1; %sum2 is never used 1369 %narrowed2 = trunc <2 x i64> %high_bits to <2 x i32> 1370 %res = shufflevector <2 x i32> %narrowed1, <2 x i32> %narrowed2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1371 ret <4 x i32> %res 1372} 1373 ; subhn counterparts of the *_natural tests: sub, lshr by half the element width, trunc. SelectionDAG is expected to emit subhn; GlobalISel is expected to emit sub + shrn. 1374define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind { 1375; CHECK-SD-LABEL: subhn8b_natural: 1376; CHECK-SD: // %bb.0: 1377; CHECK-SD-NEXT: ldr q0, [x0] 1378; CHECK-SD-NEXT: ldr q1, [x1] 1379;
CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h 1380; CHECK-SD-NEXT: ret 1381; 1382; CHECK-GI-LABEL: subhn8b_natural: 1383; CHECK-GI: // %bb.0: 1384; CHECK-GI-NEXT: ldr q0, [x0] 1385; CHECK-GI-NEXT: ldr q1, [x1] 1386; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h 1387; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 1388; CHECK-GI-NEXT: ret 1389 %tmp1 = load <8 x i16>, ptr %A 1390 %tmp2 = load <8 x i16>, ptr %B 1391 %diff = sub <8 x i16> %tmp1, %tmp2 1392 %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1393 %narrowed = trunc <8 x i16> %high_bits to <8 x i8> 1394 ret <8 x i8> %narrowed 1395} 1396 ; <4 x i32> to <4 x i16> case of the sub/lshr/trunc pattern (shift amount 16). 1397define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind { 1398; CHECK-SD-LABEL: subhn4h_natural: 1399; CHECK-SD: // %bb.0: 1400; CHECK-SD-NEXT: ldr q0, [x0] 1401; CHECK-SD-NEXT: ldr q1, [x1] 1402; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s 1403; CHECK-SD-NEXT: ret 1404; 1405; CHECK-GI-LABEL: subhn4h_natural: 1406; CHECK-GI: // %bb.0: 1407; CHECK-GI-NEXT: ldr q0, [x0] 1408; CHECK-GI-NEXT: ldr q1, [x1] 1409; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s 1410; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 1411; CHECK-GI-NEXT: ret 1412 %tmp1 = load <4 x i32>, ptr %A 1413 %tmp2 = load <4 x i32>, ptr %B 1414 %diff = sub <4 x i32> %tmp1, %tmp2 1415 %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16> 1416 %narrowed = trunc <4 x i32> %high_bits to <4 x i16> 1417 ret <4 x i16> %narrowed 1418} 1419 ; <2 x i64> to <2 x i32> case of the sub/lshr/trunc pattern (shift amount 32). 1420define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind { 1421; CHECK-SD-LABEL: subhn2s_natural: 1422; CHECK-SD: // %bb.0: 1423; CHECK-SD-NEXT: ldr q0, [x0] 1424; CHECK-SD-NEXT: ldr q1, [x1] 1425; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d 1426; CHECK-SD-NEXT: ret 1427; 1428; CHECK-GI-LABEL: subhn2s_natural: 1429; CHECK-GI: // %bb.0: 1430; CHECK-GI-NEXT: ldr q0, [x0] 1431; CHECK-GI-NEXT: ldr q1, [x1] 1432; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d 1433; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 1434; CHECK-GI-NEXT: ret 1435 %tmp1 = load <2 x i64>, ptr %A 1436 %tmp2 =
load <2 x i64>, ptr %B 1437 %diff = sub <2 x i64> %tmp1, %tmp2 1438 %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32> 1439 %narrowed = trunc <2 x i64> %high_bits to <2 x i32> 1440 ret <2 x i32> %narrowed 1441} 1442 ; subhn2_*_natural: the narrowed difference is concatenated after %low with a shufflevector. SelectionDAG is expected to emit subhn2; GlobalISel is expected to emit sub + shrn2. 1443define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind { 1444; CHECK-SD-LABEL: subhn2_16b_natural: 1445; CHECK-SD: // %bb.0: 1446; CHECK-SD-NEXT: ldr q1, [x0] 1447; CHECK-SD-NEXT: ldr q2, [x1] 1448; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1449; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h 1450; CHECK-SD-NEXT: ret 1451; 1452; CHECK-GI-LABEL: subhn2_16b_natural: 1453; CHECK-GI: // %bb.0: 1454; CHECK-GI-NEXT: ldr q1, [x0] 1455; CHECK-GI-NEXT: ldr q2, [x1] 1456; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 1457; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h 1458; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8 1459; CHECK-GI-NEXT: ret 1460 %tmp1 = load <8 x i16>, ptr %A 1461 %tmp2 = load <8 x i16>, ptr %B 1462 %diff = sub <8 x i16> %tmp1, %tmp2 1463 %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1464 %narrowed = trunc <8 x i16> %high_bits to <8 x i8> 1465 %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1466 ret <16 x i8> %res 1467} 1468 ; <4 x i32> to <4 x i16> subtract case, placed after %low in the result. 1469define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind { 1470; CHECK-SD-LABEL: subhn2_8h_natural: 1471; CHECK-SD: // %bb.0: 1472; CHECK-SD-NEXT: ldr q1, [x0] 1473; CHECK-SD-NEXT: ldr q2, [x1] 1474; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1475; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s 1476; CHECK-SD-NEXT: ret 1477; 1478; CHECK-GI-LABEL: subhn2_8h_natural: 1479; CHECK-GI: // %bb.0: 1480; CHECK-GI-NEXT: ldr q1, [x0] 1481; CHECK-GI-NEXT: ldr q2, [x1] 1482; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 1483; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s 1484; CHECK-GI-NEXT: shrn2 v0.8h,
v1.4s, #16 1485; CHECK-GI-NEXT: ret 1486 %tmp1 = load <4 x i32>, ptr %A 1487 %tmp2 = load <4 x i32>, ptr %B 1488 %diff = sub <4 x i32> %tmp1, %tmp2 1489 %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16> 1490 %narrowed = trunc <4 x i32> %high_bits to <4 x i16> 1491 %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1492 ret <8 x i16> %res 1493} 1494 ; <2 x i64> to <2 x i32> subtract case, placed after %low in the result. SelectionDAG is expected to emit subhn2; GlobalISel is expected to emit sub + shrn2. 1495define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind { 1496; CHECK-SD-LABEL: subhn2_4s_natural: 1497; CHECK-SD: // %bb.0: 1498; CHECK-SD-NEXT: ldr q1, [x0] 1499; CHECK-SD-NEXT: ldr q2, [x1] 1500; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 1501; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d 1502; CHECK-SD-NEXT: ret 1503; 1504; CHECK-GI-LABEL: subhn2_4s_natural: 1505; CHECK-GI: // %bb.0: 1506; CHECK-GI-NEXT: ldr q1, [x0] 1507; CHECK-GI-NEXT: ldr q2, [x1] 1508; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 1509; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d 1510; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32 1511; CHECK-GI-NEXT: ret 1512 %tmp1 = load <2 x i64>, ptr %A 1513 %tmp2 = load <2 x i64>, ptr %B 1514 %diff = sub <2 x i64> %tmp1, %tmp2 1515 %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32> 1516 %narrowed = trunc <2 x i64> %high_bits to <2 x i32> 1517 %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1518 ret <4 x i32> %res 1519} 1520