; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

; Signed high-half multiply on <2 x i32>: sign-extend both operands to i64,
; multiply, arithmetic-shift right by 32 and truncate back to i32. The expected
; output uses a widening vmullb.s32 followed by lane moves, not vmulh.
define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhs_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s32 q2, q0, q1
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <2 x i32> %s0 to <2 x i64>
  %b.ext = sext <2 x i32> %s1 to <2 x i64>
  %prod = mul <2 x i64> %a.ext, %b.ext
  %hi = ashr <2 x i64> %prod, <i64 32, i64 32>
  %res = trunc <2 x i64> %hi to <2 x i32>
  ret <2 x i32> %res
}

; Unsigned counterpart of the <2 x i32> case: zero-extend, multiply, logical
; shift right by 32, truncate. The expected output uses vmullb.u32 and loads a
; zero from a constant pool entry for the odd lanes.
define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhu_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u32 q2, q0, q1
; CHECK-NEXT:    vldr s1, .LCPI1_0
; CHECK-NEXT:    vmov.f32 s0, s9
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.f32 s3, s1
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %a.ext = zext <2 x i32> %s0 to <2 x i64>
  %b.ext = zext <2 x i32> %s1 to <2 x i64>
  %prod = mul <2 x i64> %a.ext, %b.ext
  %hi = lshr <2 x i64> %prod, <i64 32, i64 32>
  %res = trunc <2 x i64> %hi to <2 x i32>
  ret <2 x i32> %res
}

; Full-width signed case: the sext/mul/ashr 32/trunc pattern on <4 x i32> is
; expected to become a single vmulh.s32.
define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhs_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <4 x i32> %s0 to <4 x i64>
  %b.ext = sext <4 x i32> %s1 to <4 x i64>
  %prod = mul <4 x i64> %a.ext, %b.ext
  %hi = ashr <4 x i64> %prod, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %hi to <4 x i32>
  ret <4 x i32> %res
}

; Full-width unsigned case: the zext/mul/lshr 32/trunc pattern on <4 x i32> is
; expected to become a single vmulh.u32.
define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <4 x i32> %s0 to <4 x i64>
  %b.ext = zext <4 x i32> %s1 to <4 x i64>
  %prod = mul <4 x i64> %a.ext, %b.ext
  %hi = lshr <4 x i64> %prod, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %hi to <4 x i32>
  ret <4 x i32> %res
}

; Signed <4 x i16> (fewer elements than a full i16 vector): the expected output
; is a widening vmullb.s16 plus a 32-bit arithmetic shift by 16.
define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhs_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s16 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <4 x i16> %s0 to <4 x i32>
  %b.ext = sext <4 x i16> %s1 to <4 x i32>
  %prod = mul <4 x i32> %a.ext, %b.ext
  %hi = ashr <4 x i32> %prod, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %hi to <4 x i16>
  ret <4 x i16> %res
}

; Unsigned <4 x i16>: the expected output is vmullb.u16 plus a 32-bit logical
; shift by 16.
define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u16 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <4 x i16> %s0 to <4 x i32>
  %b.ext = zext <4 x i16> %s1 to <4 x i32>
  %prod = mul <4 x i32> %a.ext, %b.ext
  %hi = lshr <4 x i32> %prod, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %hi to <4 x i16>
  ret <4 x i16> %res
}

; Full-width signed <8 x i16>: expected to become a single vmulh.s16.
define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhs_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <8 x i16> %s0 to <8 x i32>
  %b.ext = sext <8 x i16> %s1 to <8 x i32>
  %prod = mul <8 x i32> %a.ext, %b.ext
  %hi = ashr <8 x i32> %prod, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %hi to <8 x i16>
  ret <8 x i16> %res
}

; Full-width unsigned <8 x i16>: expected to become a single vmulh.u16.
define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <8 x i16> %s0 to <8 x i32>
  %b.ext = zext <8 x i16> %s1 to <8 x i32>
  %prod = mul <8 x i32> %a.ext, %b.ext
  %hi = lshr <8 x i32> %prod, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %hi to <8 x i8>
  ret <8 x i16> %res
}

; Signed <4 x i8>: the expected output sign-extends both inputs in two vmovlb
; steps, multiplies as i32 and shifts arithmetically by 8.
define arm_aapcs_vfpcc <4 x i8> @vmulhs_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vmulhs_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmul.i32 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <4 x i8> %s0 to <4 x i16>
  %b.ext = sext <4 x i8> %s1 to <4 x i16>
  %prod = mul <4 x i16> %a.ext, %b.ext
  %hi = ashr <4 x i16> %prod, <i16 8, i16 8, i16 8, i16 8>
  %res = trunc <4 x i16> %hi to <4 x i8>
  ret <4 x i8> %res
}

; Unsigned <4 x i8>: the expected output masks both inputs with 0xff,
; multiplies as i32 and shifts logically by 8.
define arm_aapcs_vfpcc <4 x i8> @vmulhu_v4i8(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vmulhu_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmul.i32 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <4 x i8> %s0 to <4 x i16>
  %b.ext = zext <4 x i8> %s1 to <4 x i16>
  %prod = mul <4 x i16> %a.ext, %b.ext
  %hi = lshr <4 x i16> %prod, <i16 8, i16 8, i16 8, i16 8>
  %res = trunc <4 x i16> %hi to <4 x i8>
  ret <4 x i8> %res
}

; Signed <8 x i8>: the expected output is vmullb.s8 plus a 16-bit arithmetic
; shift by 8.
define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhs_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s8 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <8 x i8> %s0 to <8 x i16>
  %b.ext = sext <8 x i8> %s1 to <8 x i16>
  %prod = mul <8 x i16> %a.ext, %b.ext
  %hi = ashr <8 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %hi to <8 x i8>
  ret <8 x i8> %res
}

; Unsigned <8 x i8>: the expected output is vmullb.u8 plus a 16-bit logical
; shift by 8.
define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u8 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <8 x i8> %s0 to <8 x i16>
  %b.ext = zext <8 x i8> %s1 to <8 x i16>
  %prod = mul <8 x i16> %a.ext, %b.ext
  %hi = lshr <8 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %hi to <8 x i8>
  ret <8 x i8> %res
}

; Full-width signed <16 x i8>: expected to become a single vmulh.s8.
define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhs_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <16 x i8> %s0 to <16 x i16>
  %b.ext = sext <16 x i8> %s1 to <16 x i16>
  %prod = mul <16 x i16> %a.ext, %b.ext
  %hi = ashr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %hi to <16 x i8>
  ret <16 x i8> %res
}

; Full-width unsigned <16 x i8>: expected to become a single vmulh.u8.
define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <16 x i8> %s0 to <16 x i16>
  %b.ext = zext <16 x i8> %s1 to <16 x i16>
  %prod = mul <16 x i16> %a.ext, %b.ext
  %hi = lshr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %hi to <16 x i8>
  ret <16 x i8> %res
}

; Fixed trip-count loop (index runs 0..1024 in steps of 16) computing the
; signed high byte of x[i]*y[i] into z[i]. The shift here is lshr rather than
; ashr; the following trunc keeps only the low 8 bits of the shifted value, and
; the checks still expect vmulh.s8 inside a low-overhead loop.
define void @vmulh_s8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB14_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB14_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %x.ptr = getelementptr inbounds i8, ptr %x, i32 %i
  %x.vec = load <16 x i8>, ptr %x.ptr, align 1
  %x.ext = sext <16 x i8> %x.vec to <16 x i16>
  %y.ptr = getelementptr inbounds i8, ptr %y, i32 %i
  %y.vec = load <16 x i8>, ptr %y.ptr, align 1
  %y.ext = sext <16 x i8> %y.vec to <16 x i16>
  %prod = mul nsw <16 x i16> %y.ext, %x.ext
  %hi = lshr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %hi to <16 x i8>
  %z.ptr = getelementptr inbounds i8, ptr %z, i32 %i
  store <16 x i8> %res, ptr %z.ptr, align 1
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Same loop shape for signed i16 elements (step 8, bound 1024); the checks
; expect vmulh.s16.
define void @vmulh_s16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %x.ptr = getelementptr inbounds i16, ptr %x, i32 %i
  %x.vec = load <8 x i16>, ptr %x.ptr, align 2
  %x.ext = sext <8 x i16> %x.vec to <8 x i32>
  %y.ptr = getelementptr inbounds i16, ptr %y, i32 %i
  %y.vec = load <8 x i16>, ptr %y.ptr, align 2
  %y.ext = sext <8 x i16> %y.vec to <8 x i32>
  %prod = mul nsw <8 x i32> %y.ext, %x.ext
  %hi = lshr <8 x i32> %prod, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %hi to <8 x i16>
  %z.ptr = getelementptr inbounds i16, ptr %z, i32 %i
  store <8 x i16> %res, ptr %z.ptr, align 2
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Same loop shape for signed i32 elements (step 4, bound 1024); the checks
; expect vmulh.s32.
define void @vmulh_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %x.ptr = getelementptr inbounds i32, ptr %x, i32 %i
  %x.vec = load <4 x i32>, ptr %x.ptr, align 4
  %x.ext = sext <4 x i32> %x.vec to <4 x i64>
  %y.ptr = getelementptr inbounds i32, ptr %y, i32 %i
  %y.vec = load <4 x i32>, ptr %y.ptr, align 4
  %y.ext = sext <4 x i32> %y.vec to <4 x i64>
  %prod = mul nsw <4 x i64> %y.ext, %x.ext
  %hi = lshr <4 x i64> %prod, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %hi to <4 x i32>
  %z.ptr = getelementptr inbounds i32, ptr %z, i32 %i
  store <4 x i32> %res, ptr %z.ptr, align 4
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i8 loop (zext operands, mul nuw, lshr 8); the checks expect vmulh.u8.
define void @vmulh_u8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %x.ptr = getelementptr inbounds i8, ptr %x, i32 %i
  %x.vec = load <16 x i8>, ptr %x.ptr, align 1
  %x.ext = zext <16 x i8> %x.vec to <16 x i16>
  %y.ptr = getelementptr inbounds i8, ptr %y, i32 %i
  %y.vec = load <16 x i8>, ptr %y.ptr, align 1
  %y.ext = zext <16 x i8> %y.vec to <16 x i16>
  %prod = mul nuw <16 x i16> %y.ext, %x.ext
  %hi = lshr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %hi to <16 x i8>
  %z.ptr = getelementptr inbounds i8, ptr %z, i32 %i
  store <16 x i8> %res, ptr %z.ptr, align 1
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i16 loop; the checks expect vmulh.u16.
define void @vmulh_u16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB18_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %x.ptr = getelementptr inbounds i16, ptr %x, i32 %i
  %x.vec = load <8 x i16>, ptr %x.ptr, align 2
  %x.ext = zext <8 x i16> %x.vec to <8 x i32>
  %y.ptr = getelementptr inbounds i16, ptr %y, i32 %i
  %y.vec = load <8 x i16>, ptr %y.ptr, align 2
  %y.ext = zext <8 x i16> %y.vec to <8 x i32>
  %prod = mul nuw <8 x i32> %y.ext, %x.ext
  %hi = lshr <8 x i32> %prod, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %hi to <8 x i16>
  %z.ptr = getelementptr inbounds i16, ptr %z, i32 %i
  store <8 x i16> %res, ptr %z.ptr, align 2
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Unsigned i32 loop; the checks expect vmulh.u32.
define void @vmulh_u32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB19_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %vector.body ]
  %x.ptr = getelementptr inbounds i32, ptr %x, i32 %i
  %x.vec = load <4 x i32>, ptr %x.ptr, align 4
  %x.ext = zext <4 x i32> %x.vec to <4 x i64>
  %y.ptr = getelementptr inbounds i32, ptr %y, i32 %i
  %y.vec = load <4 x i32>, ptr %y.ptr, align 4
  %y.ext = zext <4 x i32> %y.vec to <4 x i64>
  %prod = mul nuw <4 x i64> %y.ext, %x.ext
  %hi = lshr <4 x i64> %prod, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %hi to <4 x i32>
  %z.ptr = getelementptr inbounds i32, ptr %z, i32 %i
  store <4 x i32> %res, ptr %z.ptr, align 4
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}


; Predicated variants: the loops below guard every load and store with
; llvm.get.active.lane.mask and run for n rounded up to the vector length.
; The checks expect tail-predicated loops (dlstp/letp) around vmulh.

; Signed i32, 4 lanes per iteration; the checks expect dlstp.32 and vmulh.s32.
define void @vmulh_s32_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB20_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB20_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB20_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %nonempty = icmp sgt i32 %n, 0
  br i1 %nonempty, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.pad = add i32 %n, 3
  %n.round = and i32 %n.pad, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %i = phi i32 [ 0, %vector.ph ], [ %i.next, %vector.body ]
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %i, i32 %n)
  %x.ptr = getelementptr inbounds i32, ptr %x, i32 %i
  %x.vec = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.ptr, i32 4, <4 x i1> %mask, <4 x i32> poison)
  %x.ext = sext <4 x i32> %x.vec to <4 x i64>
  %y.ptr = getelementptr inbounds i32, ptr %y, i32 %i
  %y.vec = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.ptr, i32 4, <4 x i1> %mask, <4 x i32> poison)
  %y.ext = sext <4 x i32> %y.vec to <4 x i64>
  %prod = mul nsw <4 x i64> %y.ext, %x.ext
  %hi = lshr <4 x i64> %prod, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %hi to <4 x i32>
  %d.ptr = getelementptr inbounds i32, ptr %d, i32 %i
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %res, ptr %d.ptr, i32 4, <4 x i1> %mask)
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, %n.round
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Unsigned i32, 4 lanes per iteration; the checks expect dlstp.32 and vmulh.u32.
define void @vmulh_u32_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB21_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB21_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB21_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %nonempty = icmp sgt i32 %n, 0
  br i1 %nonempty, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.pad = add i32 %n, 3
  %n.round = and i32 %n.pad, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %i = phi i32 [ 0, %vector.ph ], [ %i.next, %vector.body ]
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %i, i32 %n)
  %x.ptr = getelementptr inbounds i32, ptr %x, i32 %i
  %x.vec = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %x.ptr, i32 4, <4 x i1> %mask, <4 x i32> poison)
  %x.ext = zext <4 x i32> %x.vec to <4 x i64>
  %y.ptr = getelementptr inbounds i32, ptr %y, i32 %i
  %y.vec = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %y.ptr, i32 4, <4 x i1> %mask, <4 x i32> poison)
  %y.ext = zext <4 x i32> %y.vec to <4 x i64>
  %prod = mul nuw <4 x i64> %y.ext, %x.ext
  %hi = lshr <4 x i64> %prod, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %hi to <4 x i32>
  %d.ptr = getelementptr inbounds i32, ptr %d, i32 %i
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %res, ptr %d.ptr, i32 4, <4 x i1> %mask)
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, %n.round
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Signed i16, 8 lanes per iteration; the checks expect dlstp.16 and vmulh.s16.
define void @vmulh_s16_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB22_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %nonempty = icmp sgt i32 %n, 0
  br i1 %nonempty, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.pad = add i32 %n, 7
  %n.round = and i32 %n.pad, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %i = phi i32 [ 0, %vector.ph ], [ %i.next, %vector.body ]
  %mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %i, i32 %n)
  %x.ptr = getelementptr inbounds i16, ptr %x, i32 %i
  %x.vec = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %x.ptr, i32 2, <8 x i1> %mask, <8 x i16> poison)
  %x.ext = sext <8 x i16> %x.vec to <8 x i32>
  %y.ptr = getelementptr inbounds i16, ptr %y, i32 %i
  %y.vec = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %y.ptr, i32 2, <8 x i1> %mask, <8 x i16> poison)
  %y.ext = sext <8 x i16> %y.vec to <8 x i32>
  %prod = mul nsw <8 x i32> %y.ext, %x.ext
  %hi = lshr <8 x i32> %prod, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %hi to <8 x i16>
  %d.ptr = getelementptr inbounds i16, ptr %d, i32 %i
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %res, ptr %d.ptr, i32 2, <8 x i1> %mask)
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, %n.round
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Unsigned i16, 8 lanes per iteration; the checks expect dlstp.16 and vmulh.u16.
define void @vmulh_u16_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB23_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %nonempty = icmp sgt i32 %n, 0
  br i1 %nonempty, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.pad = add i32 %n, 7
  %n.round = and i32 %n.pad, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %i = phi i32 [ 0, %vector.ph ], [ %i.next, %vector.body ]
  %mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %i, i32 %n)
  %x.ptr = getelementptr inbounds i16, ptr %x, i32 %i
  %x.vec = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %x.ptr, i32 2, <8 x i1> %mask, <8 x i16> poison)
  %x.ext = zext <8 x i16> %x.vec to <8 x i32>
  %y.ptr = getelementptr inbounds i16, ptr %y, i32 %i
  %y.vec = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %y.ptr, i32 2, <8 x i1> %mask, <8 x i16> poison)
  %y.ext = zext <8 x i16> %y.vec to <8 x i32>
  %prod = mul nuw <8 x i32> %y.ext, %x.ext
  %hi = lshr <8 x i32> %prod, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %hi to <8 x i16>
  %d.ptr = getelementptr inbounds i16, ptr %d, i32 %i
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %res, ptr %d.ptr, i32 2, <8 x i1> %mask)
  %i.next = add i32 %i, 8
  %done = icmp eq i32 %i.next, %n.round
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Signed i8, 16 lanes per iteration; the checks expect dlstp.8 and vmulh.s8.
define void @vmulh_s8_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB24_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB24_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB24_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %nonempty = icmp sgt i32 %n, 0
  br i1 %nonempty, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.pad = add i32 %n, 15
  %n.round = and i32 %n.pad, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %i = phi i32 [ 0, %vector.ph ], [ %i.next, %vector.body ]
  %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %i, i32 %n)
  %x.ptr = getelementptr inbounds i8, ptr %x, i32 %i
  %x.vec = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %x.ptr, i32 1, <16 x i1> %mask, <16 x i8> poison)
  %x.ext = sext <16 x i8> %x.vec to <16 x i16>
  %y.ptr = getelementptr inbounds i8, ptr %y, i32 %i
  %y.vec = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %y.ptr, i32 1, <16 x i1> %mask, <16 x i8> poison)
  %y.ext = sext <16 x i8> %y.vec to <16 x i16>
  %prod = mul nsw <16 x i16> %y.ext, %x.ext
  %hi = lshr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %hi to <16 x i8>
  %d.ptr = getelementptr inbounds i8, ptr %d, i32 %i
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %res, ptr %d.ptr, i32 1, <16 x i1> %mask)
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, %n.round
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Unsigned i8, 16 lanes per iteration; the checks expect dlstp.8 and vmulh.u8.
define void @vmulh_u8_pred(ptr noalias nocapture %d, ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB25_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB25_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB25_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %nonempty = icmp sgt i32 %n, 0
  br i1 %nonempty, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.pad = add i32 %n, 15
  %n.round = and i32 %n.pad, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %i = phi i32 [ 0, %vector.ph ], [ %i.next, %vector.body ]
  %mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %i, i32 %n)
  %x.ptr = getelementptr inbounds i8, ptr %x, i32 %i
  %x.vec = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %x.ptr, i32 1, <16 x i1> %mask, <16 x i8> poison)
  %x.ext = zext <16 x i8> %x.vec to <16 x i16>
  %y.ptr = getelementptr inbounds i8, ptr %y, i32 %i
  %y.vec = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %y.ptr, i32 1, <16 x i1> %mask, <16 x i8> poison)
  %y.ext = zext <16 x i8> %y.vec to <16 x i16>
  %prod = mul nuw <16 x i16> %y.ext, %x.ext
  %hi = lshr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %hi to <16 x i8>
  %d.ptr = getelementptr inbounds i8, ptr %d, i32 %i
  call void @llvm.masked.store.v16i8.p0(<16 x i8> %res, ptr %d.ptr, i32 1, <16 x i1> %mask)
  %i.next = add i32 %i, 16
  %done = icmp eq i32 %i.next, %n.round
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}


; Reduction users: the shifted i16 product feeds llvm.vector.reduce.add
; directly, with no trunc in the IR.

; Signed operands with ashr; the checks expect vmulh.s8 followed by vaddv.s8.
define arm_aapcs_vfpcc i16 @vmulhs_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhs_reduce_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s8 q0, q0, q1
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %a.ext = sext <16 x i8> %s0 to <16 x i16>
  %b.ext = sext <16 x i8> %s1 to <16 x i16>
  %prod = mul <16 x i16> %a.ext, %b.ext
  %hi = ashr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sum = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %hi)
  ret i16 %sum
}

; Zero-extended operands, but the shift is still ashr; the checks expect
; vmulh.u8 followed by a signed vaddv.s8.
define arm_aapcs_vfpcc i16 @vmulhu_reduce_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhu_reduce_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u8 q0, q0, q1
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %a.ext = zext <16 x i8> %s0 to <16 x i16>
  %b.ext = zext <16 x i8> %s1 to <16 x i16>
  %prod = mul <16 x i16> %a.ext, %b.ext
  %hi = ashr <16 x i16> %prod, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sum = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %hi)
  ret i16 %sum
}

; Intrinsic used by the reduction tests.
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)


; Intrinsics used by the predicated loops, grouped by lane count.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)