; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.

target triple = "aarch64-unknown-linux-gnu"

;
; SMULH
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %insert = insertelement <8 x i16> undef, i16 8, i64 0
  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = sext <8 x i8> %op1 to <8 x i16>
  %2 = sext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = sext <16 x i8> %op1 to <16 x i16>
  %2 = sext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = sext <32 x i8> %op1 to <32 x i16>
  %2 = sext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

define void @smulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smulh_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %insert = insertelement <64 x i16> undef, i16 8, i64 0
  %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
  %1 = sext <64 x i8> %op1 to <64 x i16>
  %2 = sext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %1 = sext <128 x i8> %op1 to <128 x i16>
  %2 = sext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %1 = sext <256 x i8> %op1 to <256 x i16>
  %2 = sext <256 x i8> %op2 to <256 x i16>
  %mul = mul <256 x i16> %1, %2
  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <256 x i16> %shr to <256 x i8>
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %1 = sext <4 x i16> %op1 to <4 x i32>
  %2 = sext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = sext <8 x i16> %op1 to <8 x i32>
  %2 = sext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = sext <16 x i16> %op1 to <16 x i32>
  %2 = sext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

define void @smulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smulh_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %1 = sext <32 x i16> %op1 to <32 x i32>
  %2 = sext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %1 = sext <64 x i16> %op1 to <64 x i32>
  %2 = sext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %1 = sext <128 x i16> %op1 to <128 x i32>
  %2 = sext <128 x i16> %op2 to <128 x i32>
  %mul = mul <128 x i32> %1, %2
  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <128 x i32> %shr to <128 x i16>
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %1 = sext <2 x i32> %op1 to <2 x i64>
  %2 = sext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = sext <4 x i32> %op1 to <4 x i64>
  %2 = sext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %1 = sext <8 x i32> %op1 to <8 x i64>
  %2 = sext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smulh_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %1 = sext <16 x i32> %op1 to <16 x i64>
  %2 = sext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %1 = sext <32 x i32> %op1 to <32 x i64>
  %2 = sext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %1 = sext <64 x i32> %op1 to <64 x i64>
  %2 = sext <64 x i32> %op2 to <64 x i64>
  %mul = mul <64 x i64> %1, %2
  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <64 x i64> %shr to <64 x i32>
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %insert = insertelement <1 x i128> undef, i128 64, i128 0
  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
  %1 = sext <1 x i64> %op1 to <1 x i128>
  %2 = sext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, %splat
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = sext <2 x i64> %op1 to <2 x i128>
  %2 = sext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

define void @smulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = sext <4 x i64> %op1 to <4 x i128>
  %2 = sext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: smulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smulh_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %1 = sext <8 x i64> %op1 to <8 x i128>
  %2 = sext <8 x i64> %op2 to <8 x i128>
  %mul = mul <8 x i128> %1, %2
  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <8 x i128> %shr to <8 x i64>
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %1 = sext <16 x i64> %op1 to <16 x i128>
  %2 = sext <16 x i64> %op2 to <16 x i128>
  %mul = mul <16 x i128> %1, %2
  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <16 x i128> %shr to <16 x i64>
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %1 = sext <32 x i64> %op1 to <32 x i128>
  %2 = sext <32 x i64> %op2 to <32 x i128>
  %mul = mul <32 x i128> %1, %2
  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <32 x i128> %shr to <32 x i64>
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UMULH
;

; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <8 x i8> %op1 to <8 x i16>
  %2 = zext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <16 x i8> %op1 to <16 x i16>
  %2 = zext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = zext <32 x i8> %op1 to <32 x i16>
  %2 = zext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umulh_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %1 = zext <64 x i8> %op1 to <64 x i16>
  %2 = zext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %insert = insertelement <128 x i16> undef, i16 8, i64 0
  %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
  %1 = zext <128 x i8> %op1 to <128 x i16>
  %2 = zext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %1 = zext <256 x i8> %op1 to <256 x i16>
  %2 = zext <256 x i8> %op2 to <256 x i16>
  %mul = mul <256 x i16> %1, %2
  %shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %res = trunc <256 x i16> %shr to <256 x i8>
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <4 x i16> %op1 to <4 x i32>
  %2 = zext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <8 x i16> %op1 to <8 x i32>
  %2 = zext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = zext <16 x i16> %op1 to <16 x i32>
  %2 = zext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umulh_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %1 = zext <32 x i16> %op1 to <32 x i32>
  %2 = zext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %1 = zext <64 x i16> %op1 to <64 x i32>
  %2 = zext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %1 = zext <128 x i16> %op1 to <128 x i32>
  %2 = zext <128 x i16> %op2 to <128 x i32>
  %mul = mul <128 x i32> %1, %2
  %shr = lshr <128 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %res = trunc <128 x i32> %shr to <128 x i16>
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <2 x i32> %op1 to <2 x i64>
  %2 = zext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, <i64 32, i64 32>
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <4 x i32> %op1 to <4 x i64>
  %2 = zext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %insert = insertelement <8 x i64> undef, i64 32, i64 0
  %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
  %1 = zext <8 x i32> %op1 to <8 x i64>
  %2 = zext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umulh_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %1 = zext <16 x i32> %op1 to <16 x i64>
  %2 = zext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %1 = zext <32 x i32> %op1 to <32 x i64>
  %2 = zext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %1 = zext <64 x i32> %op1 to <64 x i64>
  %2 = zext <64 x i32> %op2 to <64 x i64>
  %mul = mul <64 x i64> %1, %2
  %shr = lshr <64 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %res = trunc <64 x i64> %shr to <64 x i32>
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <1 x i64> %op1 to <1 x i128>
  %2 = zext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, <i128 64>
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %1 = zext <2 x i64> %op1 to <2 x i128>
  %2 = zext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, <i128 64, i128 64>
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

define void @umulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = zext <4 x i64> %op1 to <4 x i128>
  %2 = zext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

define void @umulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: umulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umulh_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %1 = zext <8 x i64> %op1 to <8 x i128>
  %2 = zext <8 x i64> %op2 to <8 x i128>
  %mul = mul <8 x i128> %1, %2
  %shr = lshr <8 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <8 x i128> %shr to <8 x i64>
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %1 = zext <16 x i64> %op1 to <16 x i128>
  %2 = zext <16 x i64> %op2 to <16 x i128>
  %mul = mul <16 x i128> %1, %2
  %shr = lshr <16 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <16 x i128> %shr to <16 x i64>
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %1 = zext <32 x i64> %op1 to <32 x i128>
  %2 = zext <32 x i64> %op2 to <32 x i128>
  %mul = mul <32 x i128> %1, %2
  %shr = lshr <32 x i128> %mul, <i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64, i128 64>
  %res = trunc <32 x i128> %shr to <32 x i64>
  store <32 x i64> %res, ptr %a
  ret void
}
attributes #0 = { "target-features"="+sve" }