; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -verify-machineinstrs < %s | FileCheck %s

declare { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>)

define <vscale x 2 x i8> @smulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y) {
; CHECK-LABEL: smulo_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sxtb z1.d, p0/m, z1.d
; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxtb z1.d, p0/m, z0.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
  %b = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> %b
  ret <vscale x 2 x i8> %d
}

declare { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>)

define <vscale x 4 x i8> @smulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y) {
; CHECK-LABEL: smulo_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    sxtb z1.s, p0/m, z1.s
; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxtb z1.s, p0/m, z0.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
  %b = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> %b
  ret <vscale x 4 x i8> %d
}

declare { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>)

define <vscale x 8 x i8> @smulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y) {
; CHECK-LABEL: smulo_nxv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxtb z1.h, p0/m, z0.h
; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, z0.h
; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
  %b = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> %b
  ret <vscale x 8 x i8> %d
}
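
; The unpacked i8 tests above promote each element to the container lane
; width, multiply there, and flag overflow wherever sign-extending the low
; 8 bits of the product (sxtb) changes its value. The fully packed nxv16i8
; test below instead computes the high half of the product with smulh and
; flags overflow wherever it differs from the sign bits of the low half
; (asr #7).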
declare { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)

define <vscale x 16 x i8> @smulo_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; CHECK-LABEL: smulo_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    asr z1.b, z2.b, #7
; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    mov z2.b, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 16 x i8>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
  %b = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> %b
  ret <vscale x 16 x i8> %d
}
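
; Vectors wider than one Z register are split, and each part is checked
; with the same mul/smulh/asr/cmpne sequence: two parts for nxv32i8, four
; for nxv64i8.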
declare { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>)

define <vscale x 32 x i8> @smulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y) {
; CHECK-LABEL: smulo_nxv32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z3.b
; CHECK-NEXT:    movprfx z5, z0
; CHECK-NEXT:    mul z5.b, p0/m, z5.b, z2.b
; CHECK-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT:    asr z2.b, z4.b, #7
; CHECK-NEXT:    asr z3.b, z5.b, #7
; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, z2.b
; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z3.b
; CHECK-NEXT:    mov z5.b, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z4.b, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z5.d
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
  %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
  %c = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 1
  %d = select <vscale x 32 x i1> %c, <vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> %b
  ret <vscale x 32 x i8> %d
}

declare { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.smul.with.overflow.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>)

define <vscale x 64 x i8> @smulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y) {
; CHECK-LABEL: smulo_nxv64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.b, p0/m, z24.b, z7.b
; CHECK-NEXT:    movprfx z25, z0
; CHECK-NEXT:    mul z25.b, p0/m, z25.b, z4.b
; CHECK-NEXT:    movprfx z26, z2
; CHECK-NEXT:    mul z26.b, p0/m, z26.b, z6.b
; CHECK-NEXT:    movprfx z27, z1
; CHECK-NEXT:    mul z27.b, p0/m, z27.b, z5.b
; CHECK-NEXT:    smulh z3.b, p0/m, z3.b, z7.b
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z4.b
; CHECK-NEXT:    smulh z2.b, p0/m, z2.b, z6.b
; CHECK-NEXT:    smulh z1.b, p0/m, z1.b, z5.b
; CHECK-NEXT:    asr z4.b, z25.b, #7
; CHECK-NEXT:    asr z5.b, z24.b, #7
; CHECK-NEXT:    asr z6.b, z26.b, #7
; CHECK-NEXT:    asr z7.b, z27.b, #7
; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, z4.b
; CHECK-NEXT:    cmpne p2.b, p0/z, z3.b, z5.b
; CHECK-NEXT:    cmpne p3.b, p0/z, z2.b, z6.b
; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, z7.b
; CHECK-NEXT:    mov z25.b, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z26.b, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.b, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z27.b, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    mov z1.d, z27.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.smul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
  %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
  %c = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 1
  %d = select <vscale x 64 x i1> %c, <vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> %b
  ret <vscale x 64 x i8> %d
}
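
; The i16 tests repeat both strategies: the unpacked layouts (nxv2i16,
; nxv4i16) are promoted and re-narrowed with sxth, while the packed and
; split layouts compare smulh against asr #15.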
declare { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>)

define <vscale x 2 x i16> @smulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y) {
; CHECK-LABEL: smulo_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sxth z1.d, p0/m, z1.d
; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxth z1.d, p0/m, z0.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
  %b = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> %b
  ret <vscale x 2 x i16> %d
}

declare { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>)

define <vscale x 4 x i16> @smulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y) {
; CHECK-LABEL: smulo_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    sxth z1.s, p0/m, z1.s
; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxth z1.s, p0/m, z0.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
  %b = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> %b
  ret <vscale x 4 x i16> %d
}

declare { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)

define <vscale x 8 x i16> @smulo_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; CHECK-LABEL: smulo_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    asr z1.h, z2.h, #15
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    mov z2.h, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i16>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
  %b = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> %b
  ret <vscale x 8 x i16> %d
}

declare { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>)

define <vscale x 16 x i16> @smulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y) {
; CHECK-LABEL: smulo_nxv16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.h, p0/m, z4.h, z3.h
; CHECK-NEXT:    movprfx z5, z0
; CHECK-NEXT:    mul z5.h, p0/m, z5.h, z2.h
; CHECK-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT:    asr z2.h, z4.h, #15
; CHECK-NEXT:    asr z3.h, z5.h, #15
; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, z2.h
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, z3.h
; CHECK-NEXT:    mov z5.h, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z4.h, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z5.d
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
  %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> %b
  ret <vscale x 16 x i16> %d
}

declare { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>)

define <vscale x 32 x i16> @smulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y) {
; CHECK-LABEL: smulo_nxv32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.h, p0/m, z24.h, z7.h
; CHECK-NEXT:    movprfx z25, z0
; CHECK-NEXT:    mul z25.h, p0/m, z25.h, z4.h
; CHECK-NEXT:    movprfx z26, z2
; CHECK-NEXT:    mul z26.h, p0/m, z26.h, z6.h
; CHECK-NEXT:    movprfx z27, z1
; CHECK-NEXT:    mul z27.h, p0/m, z27.h, z5.h
; CHECK-NEXT:    smulh z3.h, p0/m, z3.h, z7.h
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z4.h
; CHECK-NEXT:    smulh z2.h, p0/m, z2.h, z6.h
; CHECK-NEXT:    smulh z1.h, p0/m, z1.h, z5.h
; CHECK-NEXT:    asr z4.h, z25.h, #15
; CHECK-NEXT:    asr z5.h, z24.h, #15
; CHECK-NEXT:    asr z6.h, z26.h, #15
; CHECK-NEXT:    asr z7.h, z27.h, #15
; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, z4.h
; CHECK-NEXT:    cmpne p2.h, p0/z, z3.h, z5.h
; CHECK-NEXT:    cmpne p3.h, p0/z, z2.h, z6.h
; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, z7.h
; CHECK-NEXT:    mov z25.h, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z26.h, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.h, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z27.h, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    mov z1.d, z27.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
  %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
  %c = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 1
  %d = select <vscale x 32 x i1> %c, <vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> %b
  ret <vscale x 32 x i16> %d
}
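
; For i32, only nxv2i32 needs promotion (sxtw into 64-bit lanes); the
; wider layouts use the smulh pattern with asr #31.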
declare { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>)

define <vscale x 2 x i32> @smulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) {
; CHECK-LABEL: smulo_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sxtw z1.d, p0/m, z1.d
; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxtw z1.d, p0/m, z0.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
  %b = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %b
  ret <vscale x 2 x i32> %d
}

declare { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

define <vscale x 4 x i32> @smulo_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; CHECK-LABEL: smulo_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    asr z1.s, z2.s, #31
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
  %b = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> %b
  ret <vscale x 4 x i32> %d
}

declare { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>)

define <vscale x 8 x i32> @smulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) {
; CHECK-LABEL: smulo_nxv8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z3.s
; CHECK-NEXT:    movprfx z5, z0
; CHECK-NEXT:    mul z5.s, p0/m, z5.s, z2.s
; CHECK-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT:    asr z2.s, z4.s, #31
; CHECK-NEXT:    asr z3.s, z5.s, #31
; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, z2.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z3.s
; CHECK-NEXT:    mov z5.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z4.s, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z5.d
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
  %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> %b
  ret <vscale x 8 x i32> %d
}

declare { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>)

define <vscale x 16 x i32> @smulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y) {
; CHECK-LABEL: smulo_nxv16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z7.s
; CHECK-NEXT:    movprfx z25, z0
; CHECK-NEXT:    mul z25.s, p0/m, z25.s, z4.s
; CHECK-NEXT:    movprfx z26, z2
; CHECK-NEXT:    mul z26.s, p0/m, z26.s, z6.s
; CHECK-NEXT:    movprfx z27, z1
; CHECK-NEXT:    mul z27.s, p0/m, z27.s, z5.s
; CHECK-NEXT:    smulh z3.s, p0/m, z3.s, z7.s
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z4.s
; CHECK-NEXT:    smulh z2.s, p0/m, z2.s, z6.s
; CHECK-NEXT:    smulh z1.s, p0/m, z1.s, z5.s
; CHECK-NEXT:    asr z4.s, z25.s, #31
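; CHECK-NEXT:    asr z5.s, z24.s, #31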
; CHECK-NEXT:    asr z6.s, z26.s, #31
; CHECK-NEXT:    asr z7.s, z27.s, #31
; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, z4.s
; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, z5.s
; CHECK-NEXT:    cmpne p3.s, p0/z, z2.s, z6.s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, z7.s
; CHECK-NEXT:    mov z25.s, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z26.s, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.s, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z27.s, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    mov z1.d, z27.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
  %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
  %c = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 1
  %d = select <vscale x 16 x i1> %c, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> %b
  ret <vscale x 16 x i32> %d
}

; i64 elements already fill their container lanes, so every i64 test uses
; the mul/smulh pattern with asr #63.
declare { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>)

define <vscale x 2 x i64> @smulo_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; CHECK-LABEL: smulo_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    asr z1.d, z2.d, #63
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z2.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 2 x i64>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
  %b = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 0
  %c = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i1> } %a, 1
  %d = select <vscale x 2 x i1> %c, <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> %b
  ret <vscale x 2 x i64> %d
}

declare { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>)

define <vscale x 4 x i64> @smulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y) {
; CHECK-LABEL: smulo_nxv4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.d, p0/m, z4.d, z3.d
; CHECK-NEXT:    movprfx z5, z0
; CHECK-NEXT:    mul z5.d, p0/m, z5.d, z2.d
; CHECK-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT:    asr z2.d, z4.d, #63
; CHECK-NEXT:    asr z3.d, z5.d, #63
; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, z2.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, z3.d
; CHECK-NEXT:    mov z5.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z5.d
; CHECK-NEXT:    mov z1.d, z4.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
  %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
  %c = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 1
  %d = select <vscale x 4 x i1> %c, <vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> %b
  ret <vscale x 4 x i64> %d
}

declare { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>)

define <vscale x 8 x i64> @smulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y) {
; CHECK-LABEL: smulo_nxv8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z24, z3
; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z7.d
; CHECK-NEXT:    movprfx z25, z0
; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z4.d
; CHECK-NEXT:    movprfx z26, z2
; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z6.d
; CHECK-NEXT:    movprfx z27, z1
; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z5.d
; CHECK-NEXT:    smulh z3.d, p0/m, z3.d, z7.d
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z4.d
; CHECK-NEXT:    smulh z2.d, p0/m, z2.d, z6.d
; CHECK-NEXT:    smulh z1.d, p0/m, z1.d, z5.d
; CHECK-NEXT:    asr z4.d, z25.d, #63
; CHECK-NEXT:    asr z5.d, z24.d, #63
; CHECK-NEXT:    asr z6.d, z26.d, #63
; CHECK-NEXT:    asr z7.d, z27.d, #63
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, z4.d
; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, z5.d
; CHECK-NEXT:    cmpne p3.d, p0/z, z2.d, z6.d
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, z7.d
; CHECK-NEXT:    mov z25.d, p1/m, #0 // =0x0
; CHECK-NEXT:    mov z26.d, p3/m, #0 // =0x0
; CHECK-NEXT:    mov z24.d, p2/m, #0 // =0x0
; CHECK-NEXT:    mov z27.d, p0/m, #0 // =0x0
; CHECK-NEXT:    mov z0.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z24.d
; CHECK-NEXT:    mov z1.d, z27.d
; CHECK-NEXT:    ret
  %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
  %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0
  %c = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 1
  %d = select <vscale x 8 x i1> %c, <vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> %b
  ret <vscale x 8 x i64> %d
}