; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; SREM
;

; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
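; As a rough sketch (illustrative IR, not necessarily the exact form the
; backend sees), every rem in this file is decomposed as a - (a / b) * b:
; the divide is done on elements promoted to i32 so the predicated SVE
; sdiv/udiv can be used, and the trailing mul+sub pair is folded into a
; single mls:
;   %quot = sdiv <8 x i8> %op1, %op2   ; widened lane-wise for the SVE sdiv
;   %prod = mul <8 x i8> %quot, %op2   ; mul+sub become one mls
;   %res = sub <8 x i8> %op1, %prod    ; res = op1 - (op1 / op2) * op2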
define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll v3.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    umov w8, v2.h[0]
; VBITS_GE_256-NEXT:    umov w9, v2.h[1]
; VBITS_GE_256-NEXT:    fmov s3, w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[2]
; VBITS_GE_256-NEXT:    mov v3.b[1], w9
; VBITS_GE_256-NEXT:    mov v3.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
; VBITS_GE_256-NEXT:    mov v3.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[4]
; VBITS_GE_256-NEXT:    mov v3.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[5]
; VBITS_GE_256-NEXT:    mov v3.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[6]
; VBITS_GE_256-NEXT:    mov v3.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[7]
; VBITS_GE_256-NEXT:    mov v3.b[7], w8
; VBITS_GE_256-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    umov w8, v2.h[0]
; VBITS_GE_512-NEXT:    umov w9, v2.h[1]
; VBITS_GE_512-NEXT:    fmov s3, w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[2]
; VBITS_GE_512-NEXT:    mov v3.b[1], w9
; VBITS_GE_512-NEXT:    mov v3.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[3]
; VBITS_GE_512-NEXT:    mov v3.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[4]
; VBITS_GE_512-NEXT:    mov v3.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[5]
; VBITS_GE_512-NEXT:    mov v3.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[6]
; VBITS_GE_512-NEXT:    mov v3.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[7]
; VBITS_GE_512-NEXT:    mov v3.b[7], w8
; VBITS_GE_512-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = srem <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sshll v5.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    sshll2 v7.4s, v5.8h, #0
; VBITS_GE_128-NEXT:    sshll v5.4s, v5.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sshll v3.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    uzp1 v3.8h, v3.8h, v6.8h
; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z5.s, z3.h
; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    splice z3.h, p0, z3.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    sunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_512-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = srem <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @srem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = srem <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @srem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = srem <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = srem <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
; CHECK-NEXT:    sunpklo z5.h, z5.b
; CHECK-NEXT:    sunpklo z7.s, z5.h
; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    sunpklo z5.s, z5.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT:    sunpklo z3.h, z3.b
; CHECK-NEXT:    sunpklo z6.s, z3.h
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z6.s, p1/m, z6.s, z7.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
; CHECK-NEXT:    splice z2.b, p1, z2.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = srem <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
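; For i16 elements a single signed extension (sshll/sunpklo to .s) suffices
; before the 32-bit SVE divide. Note that sdivr is the reversed divide,
; computing Zdn = Zm / Zdn, so the widened divisor can sit in the
; destination register and the quotient lands where the narrowing xtn/uzp1
; expects it; the remainder is then formed by mls as for i8.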
define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v2.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
; VBITS_GE_128-NEXT:    mls v0.4h, v2.4h, v1.4h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v2.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    sshll v3.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
; VBITS_GE_256-NEXT:    mov v3.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
; VBITS_GE_256-NEXT:    mov v3.h[2], w9
; VBITS_GE_256-NEXT:    mov v3.h[3], w8
; VBITS_GE_256-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    sshll v2.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    sshll v3.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
; VBITS_GE_512-NEXT:    mov v3.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
; VBITS_GE_512-NEXT:    mov v3.h[2], w9
; VBITS_GE_512-NEXT:    mov v3.h[3], w8
; VBITS_GE_512-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_512-NEXT:    ret
  %res = srem <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: srem_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll v4.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sshll v3.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = srem <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @srem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    sshll v16.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v7.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    sshll v6.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    sshll v7.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
; VBITS_GE_128-NEXT:    uzp1 v5.8h, v6.8h, v5.8h
; VBITS_GE_128-NEXT:    mls v3.8h, v5.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v7.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    stp q3, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    mov z4.d, z0.d
; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov z3.d, z1.d
; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = srem <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @srem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = srem <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @srem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = srem <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    mov z4.d, z0.d
; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    sunpklo z4.s, z4.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = srem <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
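; For i32 elements no widening is needed: d0/q0 alias the low bits of z0,
; so the NEON input is already a legal fixed-length SVE operand (hence the
; "kill" register-liveness comments below). Because the predicated sdiv is
; destructive (Zdn = Zdn / Zm), movprfx first copies the dividend into a
; scratch register; movprfx is a prefix instruction that may fuse with the
; following sdiv.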
define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = srem <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @srem_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z0.s
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    sdiv z19.s, p0/m, z19.s, z3.s
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    sdiv z7.s, p0/m, z7.s, z6.s
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    sdiv z18.s, p0/m, z18.s, z17.s
; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    sdiv z5.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = srem <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @srem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = srem <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = srem <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
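; For i64 there is no NEON integer divide at all, so even the 64/128-bit
; cases round-trip through SVE. The remainder uses the predicated
; multiply-subtract forms: mls computes Zda = Zda - Zn * Zm, while the msb
; seen in the v8i64 VBITS_GE_128 output is the variant overwriting the
; multiplicand (Zdn = Za - Zdn * Zm), presumably chosen so the result lands
; in the register feeding the store without an extra copy.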
define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = srem <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: srem_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = srem <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @srem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z0.d
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    sdiv z19.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    sdiv z7.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    sdiv z18.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
; VBITS_GE_128-NEXT:    movprfx z1, z2
; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: srem_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    sdiv z5.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: srem_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = srem <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @srem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: srem_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = srem <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: srem_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = srem <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UREM
;

; Vector vXi8 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
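; The unsigned expansion mirrors the signed one above (again an
; illustrative sketch, not the backend's exact intermediate form), using
; zero extension (uunpklo/ushll) and udiv in place of the signed
; counterparts:
;   %quot = udiv <8 x i8> %op1, %op2   ; widened lane-wise for the SVE udiv
;   %prod = mul <8 x i8> %quot, %op2
;   %res = sub <8 x i8> %op1, %prod    ; res = op1 - (op1 / op2) * op2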
define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v2.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll v3.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    xtn v2.8b, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8b, v2.8b, v1.8b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    umov w8, v2.h[0]
; VBITS_GE_256-NEXT:    umov w9, v2.h[1]
; VBITS_GE_256-NEXT:    fmov s3, w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[2]
; VBITS_GE_256-NEXT:    mov v3.b[1], w9
; VBITS_GE_256-NEXT:    mov v3.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[3]
; VBITS_GE_256-NEXT:    mov v3.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[4]
; VBITS_GE_256-NEXT:    mov v3.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[5]
; VBITS_GE_256-NEXT:    mov v3.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[6]
; VBITS_GE_256-NEXT:    mov v3.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v2.h[7]
; VBITS_GE_256-NEXT:    mov v3.b[7], w8
; VBITS_GE_256-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    umov w8, v2.h[0]
; VBITS_GE_512-NEXT:    umov w9, v2.h[1]
; VBITS_GE_512-NEXT:    fmov s3, w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[2]
; VBITS_GE_512-NEXT:    mov v3.b[1], w9
; VBITS_GE_512-NEXT:    mov v3.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[3]
; VBITS_GE_512-NEXT:    mov v3.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[4]
; VBITS_GE_512-NEXT:    mov v3.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[5]
; VBITS_GE_512-NEXT:    mov v3.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[6]
; VBITS_GE_512-NEXT:    mov v3.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v2.h[7]
; VBITS_GE_512-NEXT:    mov v3.b[7], w8
; VBITS_GE_512-NEXT:    mls v0.8b, v3.8b, v1.8b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = urem <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    ushll v5.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ushll2 v7.4s, v5.8h, #0
; VBITS_GE_128-NEXT:    ushll v5.4s, v5.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ushll v3.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    uzp1 v3.8h, v3.8h, v6.8h
; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z4.s, z2.h
; VBITS_GE_256-NEXT:    uunpklo z5.s, z3.h
; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    splice z3.h, p0, z3.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    uunpklo z2.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z3.h, z0.b
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z2.s, z2.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_512-NEXT:    mls v0.16b, v2.16b, v1.16b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = urem <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @urem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = urem <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @urem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = urem <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = urem <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}
define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.h, z1.b
; CHECK-NEXT:    uunpklo z3.h, z0.b
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z2.s, z2.h
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
; CHECK-NEXT:    uunpklo z5.h, z5.b
; CHECK-NEXT:    uunpklo z7.s, z5.h
; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    uunpklo z5.s, z5.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
; CHECK-NEXT:    uunpklo z3.h, z3.b
; CHECK-NEXT:    uunpklo z6.s, z3.h
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z6.s, p1/m, z6.s, z7.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
; CHECK-NEXT:    splice z2.b, p1, z2.b, z3.b
; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = urem <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
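; As in the signed i16 case, one zero-extension step (ushll/uunpklo to .s)
; precedes the 32-bit udivr; the quotient is narrowed back with xtn/uzp1
; and the remainder is formed by mls.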
define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v2.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
; VBITS_GE_128-NEXT:    mls v0.4h, v2.4h, v1.4h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v2.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    ushll v3.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
; VBITS_GE_256-NEXT:    mov v3.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
; VBITS_GE_256-NEXT:    mov v3.h[2], w9
; VBITS_GE_256-NEXT:    mov v3.h[3], w8
; VBITS_GE_256-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ushll v2.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    ushll v3.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
; VBITS_GE_512-NEXT:    mov v3.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
; VBITS_GE_512-NEXT:    mov v3.h[2], w9
; VBITS_GE_512-NEXT:    mov v3.h[3], w8
; VBITS_GE_512-NEXT:    mls v0.4h, v3.4h, v1.4h
; VBITS_GE_512-NEXT:    ret
  %res = urem <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: urem_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll v4.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ushll v3.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = urem <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @urem_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    ushll v16.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v7.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    ushll v6.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
; VBITS_GE_128-NEXT:    ushll v7.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
; VBITS_GE_128-NEXT:    uzp1 v5.8h, v6.8h, v5.8h
; VBITS_GE_128-NEXT:    mls v3.8h, v5.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v2.8h, v7.8h, v2.8h
; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
; VBITS_GE_128-NEXT:    stp q3, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    mov z4.d, z0.d
; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    mov z3.d, z1.d
; VBITS_GE_256-NEXT:    uunpklo z4.s, z4.h
; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
; VBITS_GE_256-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_512-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_512-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_512-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = urem <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @urem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = urem <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @urem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = urem <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    mov z4.d, z0.d
; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    mov z3.d, z1.d
; CHECK-NEXT:    uunpklo z4.s, z4.h
; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
; CHECK-NEXT:    uunpklo z3.s, z3.h
; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = urem <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 udiv are not legal for NEON so use SVE when available.
define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 udiv are not legal for NEON so use SVE when available.
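; Same shape as srem_v4i32: ptrue p0.s, vl4 limits the predicated SVE udiv
; to exactly the four lanes of the NEON-sized operand, so the result is the
; same whatever the actual hardware vector length.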
define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = urem <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @urem_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z0.s
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    udiv z19.s, p0/m, z19.s, z3.s
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    udiv z7.s, p0/m, z7.s, z6.s
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    udiv z18.s, p0/m, z18.s, z17.s
; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    udiv z5.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = urem <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @urem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = urem <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    mls z0.s, p0/m, z2.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = urem <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for the 128 bits case here.
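; The <2 x i64> case below likewise uses a predicated SVE udiv (vl2) plus
; mls; per the FIXME above, a scalar udiv sequence might be preferable for
; these NEON-sized cases.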
define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = urem <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: urem_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = urem <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @urem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
; VBITS_GE_128-NEXT:    movprfx z4, z1
; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z0.d
; VBITS_GE_128-NEXT:    movprfx z19, z2
; VBITS_GE_128-NEXT:    udiv z19.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    movprfx z7, z5
; VBITS_GE_128-NEXT:    udiv z7.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    movprfx z18, z16
; VBITS_GE_128-NEXT:    udiv z18.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
; VBITS_GE_128-NEXT:    movprfx z1, z2
; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z19.d, z3.d
; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: urem_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    movprfx z2, z0
; VBITS_GE_256-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z5, z3
; VBITS_GE_256-NEXT:    udiv z5.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z3
; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: urem_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    movprfx z2, z0
; VBITS_GE_512-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = urem <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @urem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: urem_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = urem <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @urem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: urem_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    mls z0.d, p0/m, z2.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = urem <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }