; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc i32 @vqdmulh_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <16 x i8> @vqdmulh_v16i8_b(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v16i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <16 x i32> %l9 to <16 x i8>
  ret <16 x i8> %l10
}

define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v8i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i8> %s0 to <8 x i32>
  %l5 = sext <8 x i8> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <8 x i32> %l9 to <8 x i8>
  ret <8 x i8> %l10
}

define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v4i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i8> %s0 to <4 x i32>
  %l5 = sext <4 x i8> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 7, i32 7, i32 7, i32 7>
  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <4 x i32> %l9 to <4 x i8>
  ret <4 x i8> %l10
}

define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
; CHECK-LABEL: vqdmulh_v32i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s8 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <32 x i8> %s0 to <32 x i32>
  %l5 = sext <32 x i8> %s1 to <32 x i32>
  %l6 = mul nsw <32 x i32> %l5, %l2
  %l7 = ashr <32 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l9 = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %l7, <32 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %l10 = trunc <32 x i32> %l9 to <32 x i8>
  ret <32 x i8> %l10
}

define arm_aapcs_vfpcc i32 @vqdmulh_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
  ret i32 %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_b(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v4i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i16> %s0 to <4 x i32>
  %l5 = sext <4 x i16> %s1 to <4 x i32>
  %l6 = mul nsw <4 x i32> %l5, %l2
  %l7 = ashr <4 x i32> %l6, <i32 15, i32 15, i32 15, i32 15>
  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <4 x i32> %l9 to <4 x i16>
  ret <4 x i16> %l10
}

define arm_aapcs_vfpcc <16 x i16> @vqdmulh_v16i16_b(<16 x i16> %s0, <16 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v16i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s16 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i16> %s0 to <16 x i32>
  %l5 = sext <16 x i16> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <16 x i32> %l9 to <16 x i16>
  ret <16 x i16> %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_c:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmullb.s16 q2, q3, q2
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    vshl.i32 q2, q2, #10
; CHECK-NEXT:    vshr.s32 q2, q2, #10
; CHECK-NEXT:    vshr.s32 q2, q2, #15
; CHECK-NEXT:    vstrh.32 q2, [r0, #8]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
; CHECK-NEXT:    vmov.u16 r1, q1[2]
; CHECK-NEXT:    vmov.u16 r2, q1[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r1
; CHECK-NEXT:    vmov.u16 r1, q1[3]
; CHECK-NEXT:    vmov.u16 r2, q1[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r2, r1
; CHECK-NEXT:    vmullb.s16 q0, q0, q2
; CHECK-NEXT:    vshl.i32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #15
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i22>
  %l5 = sext <8 x i16> %s1 to <8 x i22>
  %l6 = mul nsw <8 x i22> %l5, %l2
  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
  %l9 = call <8 x i22> @llvm.smin.v8i22(<8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>)
  %l10 = trunc <8 x i22> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %1 = sext <8 x i16> %0 to <8 x i32>
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %2 = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  %3 = sext <8 x i16> %2 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %3, %1
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  %4 = shufflevector <8 x i16> %l10, <8 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %4
}

define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_v8i16_interleaved2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vqdmulh.s16 q2, q1, q0
; CHECK-NEXT:    vrev32.16 q1, q1
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vmovnt.i32 q2, q0
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
  %s0 = trunc <4 x i32> %s0a to <4 x i16>
  %strided.vec = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec44 = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %l7 = sext <4 x i16> %strided.vec to <4 x i32>
  %l8 = sext <4 x i16> %s0 to <4 x i32>
  %l9 = mul nsw <4 x i32> %l7, %l8
  %l10 = ashr <4 x i32> %l9, <i32 15, i32 15, i32 15, i32 15>
  %l12 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l10, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l13 = trunc <4 x i32> %l12 to <4 x i16>
  %l14 = sext <4 x i16> %strided.vec44 to <4 x i32>
  %l15 = mul nsw <4 x i32> %l14, %l8
  %l16 = ashr <4 x i32> %l15, <i32 15, i32 15, i32 15, i32 15>
  %l18 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
  %l19 = trunc <4 x i32> %l18 to <4 x i16>
  %interleaved.vec = shufflevector <4 x i16> %l13, <4 x i16> %l19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i16> %interleaved.vec
}

define arm_aapcs_vfpcc i64 @vqdmulh_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
  ret i64 %l10
}

define arm_aapcs_vfpcc <4 x i32> @vqdmulh_v4i32_b(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v4i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <4 x i64> %l9 to <4 x i32>
  ret <4 x i32> %l10
}

define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v2i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <2 x i32> %s0 to <2 x i64>
  %l5 = sext <2 x i32> %s1 to <2 x i64>
  %l6 = mul nsw <2 x i64> %l5, %l2
  %l7 = ashr <2 x i64> %l6, <i64 31, i64 31>
  %l9 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %l7, <2 x i64> <i64 2147483647, i64 2147483647>)
  %l10 = trunc <2 x i64> %l9 to <2 x i32>
  ret <2 x i32> %l10
}

define arm_aapcs_vfpcc <8 x i32> @vqdmulh_v8i32_b(<8 x i32> %s0, <8 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v8i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q2, q0
; CHECK-NEXT:    vqdmulh.s32 q1, q3, q1
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i32> %s0 to <8 x i64>
  %l5 = sext <8 x i32> %s1 to <8 x i64>
  %l6 = mul nsw <8 x i64> %l5, %l2
  %l7 = ashr <8 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l9 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %l7, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <8 x i64> %l9 to <8 x i32>
  ret <8 x i32> %l10
}

define arm_aapcs_vfpcc <16 x i32> @vqdmulh_v16i32_b(<16 x i32> %s0, <16 x i32> %s1) {
; CHECK-LABEL: vqdmulh_v16i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #32
; CHECK-NEXT:    vqdmulh.s32 q0, q4, q0
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #48
; CHECK-NEXT:    vqdmulh.s32 q1, q4, q1
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    add r0, sp, #64
; CHECK-NEXT:    vqdmulh.s32 q2, q4, q2
; CHECK-NEXT:    vldrw.u32 q4, [r0]
; CHECK-NEXT:    vqdmulh.s32 q3, q4, q3
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i32> %s0 to <16 x i64>
  %l5 = sext <16 x i32> %s1 to <16 x i64>
  %l6 = mul nsw <16 x i64> %l5, %l2
  %l7 = ashr <16 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %l9 = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %l7, <16 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %l10 = trunc <16 x i64> %l9 to <16 x i32>
  ret <16 x i32> %l10
}



define void @vqdmulh_loop_i8(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) local_unnamed_addr #0 {
; CHECK-LABEL: vqdmulh_loop_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  %wide.load = load <16 x i8>, ptr %0, align 1
  %1 = sext <16 x i8> %wide.load to <16 x i32>
  %2 = getelementptr inbounds i8, ptr %y, i32 %index
  %wide.load26 = load <16 x i8>, ptr %2, align 1
  %3 = sext <16 x i8> %wide.load26 to <16 x i32>
  %4 = mul nsw <16 x i32> %3, %1
  %5 = ashr <16 x i32> %4, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %6 = icmp slt <16 x i32> %5, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %7 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %5, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
  %8 = trunc <16 x i32> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, ptr %z, i32 %index
  store <16 x i8> %8, ptr %9, align 1
  %index.next = add i32 %index, 16
  %10 = icmp eq i32 %index.next, 1024
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i16(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB18_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB18_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  %wide.load = load <8 x i16>, ptr %0, align 2
  %1 = sext <8 x i16> %wide.load to <8 x i32>
  %2 = getelementptr inbounds i16, ptr %y, i32 %index
  %wide.load30 = load <8 x i16>, ptr %2, align 2
  %3 = sext <8 x i16> %wide.load30 to <8 x i32>
  %4 = mul nsw <8 x i32> %3, %1
  %5 = ashr <8 x i32> %4, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %6 = icmp slt <8 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %7 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %5, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, ptr %z, i32 %index
  store <8 x i16> %8, ptr %9, align 2
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, 1024
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vqdmulh_loop_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB19_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB19_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = sext <4 x i32> %wide.load to <4 x i64>
  %2 = getelementptr inbounds i32, ptr %y, i32 %index
  %wide.load30 = load <4 x i32>, ptr %2, align 4
  %3 = sext <4 x i32> %wide.load30 to <4 x i64>
  %4 = mul nsw <4 x i64> %3, %1
  %5 = ashr <4 x i64> %4, <i64 31, i64 31, i64 31, i64 31>
  %6 = icmp slt <4 x i64> %5, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %7 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %5, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, ptr %z, i32 %index
  store <4 x i32> %8, ptr %9, align 4
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, 1024
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define i32 @scalar(i16 %a) {
; CHECK-LABEL: scalar:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    smulbb r1, r0, r0
; CHECK-NEXT:    movs r0, #127
; CHECK-NEXT:    lsrs r2, r1, #7
; CHECK-NEXT:    cmp r2, #127
; CHECK-NEXT:    it lt
; CHECK-NEXT:    lsrlt r0, r1, #7
; CHECK-NEXT:    bx lr
  %e = sext i16 %a to i32
  %d = mul nsw i32 %e, %e
  %b = ashr i32 %d, 7
  %c = call i32 @llvm.smin.i32(i32 %b, i32 127)
  ret i32 %c
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.smin.i32(i32 %a, i32 %b)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
declare <8 x i22> @llvm.smin.v8i22(<8 x i22>, <8 x i22>)