; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Code generation tests for the MVE halving-add instructions.
;
; Every test widens both inputs (sext for the signed "s" variants, zext for
; the unsigned "u" variants), adds them in the wider type, shifts right by
; one and narrows the result again:
;   vhadd  : (a + b) >> 1
;   vrhadd : (a + b + 1) >> 1
; For the 128-bit vector types (v4i32, v8i16, v16i8) a single vhadd/vrhadd is
; expected. For the narrower vector types (v4i16, v4i8, v8i8) the expected
; output first extends or masks the inputs; the unsigned variants then still
; use vhadd/vrhadd while the signed variants use a separate add and shift.

;------------------------------------------------------------------------------
; Halving add, (a + b) >> 1
;------------------------------------------------------------------------------

; Signed v4i32 via i64: expects a single vhadd.s32.
define arm_aapcs_vfpcc <4 x i32> @vhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vhadds_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %m = add nsw <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}

; Unsigned v4i32 via i64: expects a single vhadd.u32.
define arm_aapcs_vfpcc <4 x i32> @vhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vhaddu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %m = add nuw nsw <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 1, i64 1, i64 1, i64 1>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}

; Signed v4i16 (narrower than a q register): expects sign-extension followed
; by a plain add and shift, no vhadd.
define arm_aapcs_vfpcc <4 x i16> @vhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vhadds_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %m = add nsw <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}

; Unsigned v4i16: expects zero-extension followed by vhadd.u32.
define arm_aapcs_vfpcc <4 x i16> @vhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vhaddu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %m = add nuw nsw <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}

; Signed v8i16 via i32: expects a single vhadd.s16.
define arm_aapcs_vfpcc <8 x i16> @vhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vhadds_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %m = add nsw <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}

; Unsigned v8i16 via i32: expects a single vhadd.u16.
define arm_aapcs_vfpcc <8 x i16> @vhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vhaddu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %m = add nuw nsw <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}

; Signed v4i8: expects two levels of sign-extension, then add and shift.
define arm_aapcs_vfpcc <4 x i8> @vhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vhadds_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i8> %s0 to <4 x i16>
  %s1s = sext <4 x i8> %s1 to <4 x i16>
  %m = add nsw <4 x i16> %s0s, %s1s
  %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %s2
}

; Unsigned v4i8: expects both inputs masked with 0xff, then vhadd.u32.
define arm_aapcs_vfpcc <4 x i8> @vhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vhaddu_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i8> %s0 to <4 x i16>
  %s1s = zext <4 x i8> %s1 to <4 x i16>
  %m = add nuw nsw <4 x i16> %s0s, %s1s
  %s = lshr <4 x i16> %m, <i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %s2
}

; Signed v8i8: expects sign-extension followed by a plain add and shift.
define arm_aapcs_vfpcc <8 x i8> @vhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vhadds_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %m = add nsw <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}

; Unsigned v8i8: expects zero-extension followed by vhadd.u16.
define arm_aapcs_vfpcc <8 x i8> @vhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vhaddu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vhadd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %m = add nuw nsw <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}

; Signed v16i8 via i16: expects a single vhadd.s8.
define arm_aapcs_vfpcc <16 x i8> @vhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vhadds_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %m = add nsw <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}

; Unsigned v16i8 via i16: expects a single vhadd.u8.
define arm_aapcs_vfpcc <16 x i8> @vhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vhaddu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = add nuw nsw <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}

;------------------------------------------------------------------------------
; Rounding halving add, (a + b + 1) >> 1
;------------------------------------------------------------------------------

; Signed v4i32 via i64: expects a single vrhadd.s32.
define arm_aapcs_vfpcc <4 x i32> @vrhadds_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vrhadds_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %add = add nsw <4 x i64> %s0s, <i64 1, i64 1, i64 1, i64 1>
  %add2 = add nsw <4 x i64> %add, %s1s
  %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %result
}

; Unsigned v4i32 via i64: expects a single vrhadd.u32.
define arm_aapcs_vfpcc <4 x i32> @vrhaddu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vrhaddu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %add = add nuw nsw <4 x i64> %s0s, <i64 1, i64 1, i64 1, i64 1>
  %add2 = add nuw nsw <4 x i64> %add, %s1s
  %s = lshr <4 x i64> %add2, <i64 1, i64 1, i64 1, i64 1>
  %result = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %result
}

; Signed v4i16: expects sign-extension, two adds (the +1 comes from r0) and a
; shift, no vrhadd.
define arm_aapcs_vfpcc <4 x i16> @vrhadds_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vrhadds_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %add = add nsw <4 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1>
  %add2 = add nsw <4 x i32> %add, %s1s
  %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %result
}

; Unsigned v4i16: expects zero-extension followed by vrhadd.u32.
define arm_aapcs_vfpcc <4 x i16> @vrhaddu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vrhaddu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %add = add nuw nsw <4 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1>
  %add2 = add nuw nsw <4 x i32> %add, %s1s
  %s = lshr <4 x i32> %add2, <i32 1, i32 1, i32 1, i32 1>
  %result = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %result
}

; Signed v8i16 via i32: expects a single vrhadd.s16.
define arm_aapcs_vfpcc <8 x i16> @vrhadds_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vrhadds_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %add = add nsw <8 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add2 = add nsw <8 x i32> %add, %s1s
  %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %result
}

; Unsigned v8i16 via i32: expects a single vrhadd.u16.
define arm_aapcs_vfpcc <8 x i16> @vrhaddu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vrhaddu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %add = add nuw nsw <8 x i32> %s0s, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %add2 = add nuw nsw <8 x i32> %add, %s1s
  %s = lshr <8 x i32> %add2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %result = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %result
}

; Signed v4i8: expects two levels of sign-extension, two adds and a shift.
define arm_aapcs_vfpcc <4 x i8> @vrhadds_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vrhadds_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vshr.u32 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i8> %s0 to <4 x i16>
  %s1s = sext <4 x i8> %s1 to <4 x i16>
  %add = add nsw <4 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1>
  %add2 = add nsw <4 x i16> %add, %s1s
  %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  %result = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %result
}

; Unsigned v4i8: expects both inputs masked with 0xff, then vrhadd.u32.
define arm_aapcs_vfpcc <4 x i8> @vrhaddu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vrhaddu_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i8> %s0 to <4 x i16>
  %s1s = zext <4 x i8> %s1 to <4 x i16>
  %add = add nuw nsw <4 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1>
  %add2 = add nuw nsw <4 x i16> %add, %s1s
  %s = lshr <4 x i16> %add2, <i16 1, i16 1, i16 1, i16 1>
  %result = trunc <4 x i16> %s to <4 x i8>
  ret <4 x i8> %result
}

; Signed v8i8: expects sign-extension, two adds and a shift.
define arm_aapcs_vfpcc <8 x i8> @vrhadds_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vrhadds_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    vadd.i16 q0, q0, r0
; CHECK-NEXT:    vshr.u16 q0, q0, #1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %add = add nsw <8 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nsw <8 x i16> %add, %s1s
  %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %result
}

; Unsigned v8i8: expects zero-extension followed by vrhadd.u16.
define arm_aapcs_vfpcc <8 x i8> @vrhaddu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vrhaddu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %add = add nuw nsw <8 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nuw nsw <8 x i16> %add, %s1s
  %s = lshr <8 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %result
}

; Signed v16i8 via i16: expects a single vrhadd.s8.
define arm_aapcs_vfpcc <16 x i8> @vrhadds_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vrhadds_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %add = add nsw <16 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nsw <16 x i16> %add, %s1s
  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %result
}

; Unsigned v16i8 via i16: expects a single vrhadd.u8.
define arm_aapcs_vfpcc <16 x i8> @vrhaddu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vrhaddu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %add = add nuw nsw <16 x i16> %s0s, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %add2 = add nuw nsw <16 x i16> %add, %s1s
  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %result
}

;------------------------------------------------------------------------------
; Halving add inside a vector loop
;
; Each loop processes 1024 elements (the exit compare is against 1024), one
; 128-bit vector per iteration; %n is unused. The expected output is a
; low-overhead loop (le) whose body is two post-incremented loads, one
; vhadd/vrhadd and one post-incremented store.
;------------------------------------------------------------------------------

; Signed i8 loop: expects vhadd.s8 in the loop body.
define void @vhadd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB24_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vhadd.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB24_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 1
  %1 = sext <16 x i8> %wide.load to <16 x i16>
  %2 = getelementptr inbounds i8, ptr %y, i32 %index
  %wide.load16 = load <16 x i8>, ptr %2, align 1
  %3 = sext <16 x i8> %wide.load16 to <16 x i16>
  %4 = add nsw <16 x i16> %3, %1
  %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = trunc <16 x i16> %5 to <16 x i8>
  %7 = getelementptr inbounds i8, ptr %z, i32 %index
  store <16 x i8> %6, ptr %7, align 1
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, 1024
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Signed i16 loop: expects vhadd.s16 in the loop body.
define void @vhadd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB25_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vhadd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB25_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = sext <8 x i16> %wide.load to <8 x i32>
  %2 = getelementptr inbounds i16, ptr %y, i32 %index
  %wide.load16 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load16 to <8 x i32>
  %4 = add nsw <8 x i32> %3, %1
  %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = trunc <8 x i32> %5 to <8 x i16>
  %7 = getelementptr inbounds i16, ptr %z, i32 %index
  store <8 x i16> %6, ptr %7, align 2
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, 1024
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Signed i32 loop: expects vhadd.s32 in the loop body.
define void @vhadd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB26_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vhadd.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB26_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = sext <4 x i32> %wide.load to <4 x i64>
  %2 = getelementptr inbounds i32, ptr %y, i32 %index
  %wide.load16 = load <4 x i32>, ptr %2, align 4
  %3 = sext <4 x i32> %wide.load16 to <4 x i64>
  %4 = add nsw <4 x i64> %3, %1
  %5 = lshr <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
  %6 = trunc <4 x i64> %5 to <4 x i32>
  %7 = getelementptr inbounds i32, ptr %z, i32 %index
  store <4 x i32> %6, ptr %7, align 4
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, 1024
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i8 loop: expects vhadd.u8 in the loop body.
define void @vhadd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB27_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vhadd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB27_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 1
  %1 = zext <16 x i8> %wide.load to <16 x i16>
  %2 = getelementptr inbounds i8, ptr %y, i32 %index
  %wide.load16 = load <16 x i8>, ptr %2, align 1
  %3 = zext <16 x i8> %wide.load16 to <16 x i16>
  %4 = add nuw nsw <16 x i16> %3, %1
  %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = trunc <16 x i16> %5 to <16 x i8>
  %7 = getelementptr inbounds i8, ptr %z, i32 %index
  store <16 x i8> %6, ptr %7, align 1
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, 1024
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i16 loop: expects vhadd.u16 in the loop body.
define void @vhadd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB28_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vhadd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB28_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = zext <8 x i16> %wide.load to <8 x i32>
  %2 = getelementptr inbounds i16, ptr %y, i32 %index
  %wide.load16 = load <8 x i16>, ptr %2, align 2
  %3 = zext <8 x i16> %wide.load16 to <8 x i32>
  %4 = add nuw nsw <8 x i32> %3, %1
  %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = trunc <8 x i32> %5 to <8 x i16>
  %7 = getelementptr inbounds i16, ptr %z, i32 %index
  store <8 x i16> %6, ptr %7, align 2
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, 1024
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i32 loop: expects vhadd.u32 in the loop body.
define void @vhadd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vhadd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB29_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vhadd.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB29_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = zext <4 x i32> %wide.load to <4 x i64>
  %2 = getelementptr inbounds i32, ptr %y, i32 %index
  %wide.load16 = load <4 x i32>, ptr %2, align 4
  %3 = zext <4 x i32> %wide.load16 to <4 x i64>
  %4 = add nuw nsw <4 x i64> %3, %1
  %5 = lshr <4 x i64> %4, <i64 1, i64 1, i64 1, i64 1>
  %6 = trunc <4 x i64> %5 to <4 x i32>
  %7 = getelementptr inbounds i32, ptr %z, i32 %index
  store <4 x i32> %6, ptr %7, align 4
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, 1024
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;------------------------------------------------------------------------------
; Rounding halving add inside a vector loop
;
; NOTE(review): the three signed loops below previously used zext and
; "add nuw nsw" and expected vrhadd.u8/u16/u32, which made them exact
; duplicates of vrhadd_loop_u8/u16/u32 and left the signed loop form untested.
; They now use sext and "add nsw", matching vhadd_loop_s* and vrhadds_v16i8.
; The expected instruction was updated by hand to the signed form; rerun
; utils/update_llc_test_checks.py to confirm the generated assertions.
;------------------------------------------------------------------------------

; Signed i8 loop: expects vrhadd.s8 in the loop body.
define void @vrhadd_loop_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB30_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vrhadd.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB30_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 1
  %1 = sext <16 x i8> %wide.load to <16 x i16>
  %2 = getelementptr inbounds i8, ptr %y, i32 %index
  %wide.load16 = load <16 x i8>, ptr %2, align 1
  %3 = sext <16 x i8> %wide.load16 to <16 x i16>
  %4 = add nsw <16 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %5 = add nsw <16 x i16> %4, %3
  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <16 x i16> %6 to <16 x i8>
  %8 = getelementptr inbounds i8, ptr %z, i32 %index
  store <16 x i8> %7, ptr %8, align 1
  %index.next = add i32 %index, 16
  %9 = icmp eq i32 %index.next, 1024
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Signed i16 loop: expects vrhadd.s16 in the loop body.
define void @vrhadd_loop_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB31_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vrhadd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB31_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = sext <8 x i16> %wide.load to <8 x i32>
  %2 = getelementptr inbounds i16, ptr %y, i32 %index
  %wide.load16 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load16 to <8 x i32>
  %4 = add nsw <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = add nsw <8 x i32> %4, %3
  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <8 x i32> %6 to <8 x i16>
  %8 = getelementptr inbounds i16, ptr %z, i32 %index
  store <8 x i16> %7, ptr %8, align 2
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, 1024
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Signed i32 loop: expects vrhadd.s32 in the loop body.
define void @vrhadd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB32_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vrhadd.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB32_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = sext <4 x i32> %wide.load to <4 x i64>
  %2 = getelementptr inbounds i32, ptr %y, i32 %index
  %wide.load16 = load <4 x i32>, ptr %2, align 4
  %3 = sext <4 x i32> %wide.load16 to <4 x i64>
  %4 = add nsw <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
  %5 = add nsw <4 x i64> %4, %3
  %6 = lshr <4 x i64> %5, <i64 1, i64 1, i64 1, i64 1>
  %7 = trunc <4 x i64> %6 to <4 x i32>
  %8 = getelementptr inbounds i32, ptr %z, i32 %index
  store <4 x i32> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, 1024
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i8 loop: expects vrhadd.u8 in the loop body.
define void @vrhadd_loop_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB33_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB33_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 1
  %1 = zext <16 x i8> %wide.load to <16 x i16>
  %2 = getelementptr inbounds i8, ptr %y, i32 %index
  %wide.load16 = load <16 x i8>, ptr %2, align 1
  %3 = zext <16 x i8> %wide.load16 to <16 x i16>
  %4 = add nuw nsw <16 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %5 = add nuw nsw <16 x i16> %4, %3
  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <16 x i16> %6 to <16 x i8>
  %8 = getelementptr inbounds i8, ptr %z, i32 %index
  store <16 x i8> %7, ptr %8, align 1
  %index.next = add i32 %index, 16
  %9 = icmp eq i32 %index.next, 1024
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i16 loop: expects vrhadd.u16 in the loop body.
define void @vrhadd_loop_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB34_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB34_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = zext <8 x i16> %wide.load to <8 x i32>
  %2 = getelementptr inbounds i16, ptr %y, i32 %index
  %wide.load16 = load <8 x i16>, ptr %2, align 2
  %3 = zext <8 x i16> %wide.load16 to <8 x i32>
  %4 = add nuw nsw <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = add nuw nsw <8 x i32> %4, %3
  %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <8 x i32> %6 to <8 x i16>
  %8 = getelementptr inbounds i16, ptr %z, i32 %index
  store <8 x i16> %7, ptr %8, align 2
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, 1024
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i32 loop: expects vrhadd.u32 in the loop body.
define void @vrhadd_loop_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture writeonly %z, i32 %n) {
; CHECK-LABEL: vrhadd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB35_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vrhadd.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB35_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = zext <4 x i32> %wide.load to <4 x i64>
  %2 = getelementptr inbounds i32, ptr %y, i32 %index
  %wide.load16 = load <4 x i32>, ptr %2, align 4
  %3 = zext <4 x i32> %wide.load16 to <4 x i64>
  %4 = add nuw nsw <4 x i64> %1, <i64 1, i64 1, i64 1, i64 1>
  %5 = add nuw nsw <4 x i64> %4, %3
  %6 = lshr <4 x i64> %5, <i64 1, i64 1, i64 1, i64 1>
  %7 = trunc <4 x i64> %6 to <4 x i32>
  %8 = getelementptr inbounds i32, ptr %z, i32 %index
  store <4 x i32> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, 1024
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;------------------------------------------------------------------------------
; Halving add feeding an add reduction
;
; Here the shifted i16 vector is not truncated explicitly; it is summed with
; llvm.vector.reduce.add and returned as i16. The signed variants shift with
; ashr, the unsigned ones with lshr. The expected output is a vhadd/vrhadd on
; the i8 inputs followed by a vaddv.
;------------------------------------------------------------------------------

; Signed halving add + reduction: expects vhadd.s8 then vaddv.s8.
define arm_aapcs_vfpcc i16 @vhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vhadds_reduce_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.s8 q0, q0, q1
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %add = add <16 x i16> %s0s, %s1s
  %s = ashr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %result
}

; Unsigned halving add + reduction: expects vhadd.u8 then vaddv.u8.
define arm_aapcs_vfpcc i16 @vhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vhaddu_reduce_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vhadd.u8 q0, q0, q1
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %add = add <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %result
}

; Signed rounding halving add + reduction: expects vrhadd.s8 then vaddv.s8.
define arm_aapcs_vfpcc i16 @vrhadds_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vrhadds_reduce_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.s8 q0, q0, q1
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %add = add <16 x i16> %s0s, %s1s
  %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s = ashr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %result
}

; Unsigned rounding halving add + reduction: expects vrhadd.u8 then vaddv.u8.
define arm_aapcs_vfpcc i16 @vrhaddu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vrhaddu_reduce_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vrhadd.u8 q0, q0, q1
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %add = add <16 x i16> %s0s, %s1s
  %add2 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %s = lshr <16 x i16> %add2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
  ret i16 %result
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)