; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; SDIV
;
; In the checks below, i8/i16 element divides are widened to 32-bit elements
; (sshll/sunpklo) before the SVE sdiv, since only .s/.d sdiv forms appear.

; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
; VBITS_GE_256-NEXT:    fmov s0, w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
; VBITS_GE_256-NEXT:    mov v0.b[1], w9
; VBITS_GE_256-NEXT:    mov v0.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
; VBITS_GE_256-NEXT:    mov v0.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
; VBITS_GE_256-NEXT:    mov v0.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
; VBITS_GE_256-NEXT:    mov v0.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
; VBITS_GE_256-NEXT:    mov v0.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
; VBITS_GE_256-NEXT:    mov v0.b[7], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    umov w8, v1.h[0]
; VBITS_GE_512-NEXT:    umov w9, v1.h[1]
; VBITS_GE_512-NEXT:    fmov s0, w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[2]
; VBITS_GE_512-NEXT:    mov v0.b[1], w9
; VBITS_GE_512-NEXT:    mov v0.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[3]
; VBITS_GE_512-NEXT:    mov v0.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[4]
; VBITS_GE_512-NEXT:    mov v0.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[5]
; VBITS_GE_512-NEXT:    mov v0.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[6]
; VBITS_GE_512-NEXT:    mov v0.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[7]
; VBITS_GE_512-NEXT:    mov v0.b[7], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sdiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = sdiv <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @sdiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = sdiv <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    st1b { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = sdiv <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.h, z1.b
; CHECK-NEXT:    sunpklo z3.h, z0.b
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z1.h, z1.b
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpklo z5.s, z3.h
; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z2.s, z2.h
; CHECK-NEXT:    sunpklo z3.s, z3.h
; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
; CHECK-NEXT:    sunpklo z5.s, z0.h
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
; CHECK-NEXT:    ptrue p1.b, vl128
; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
; CHECK-NEXT:    splice z0.b, p1, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = sdiv <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
; NOTE(review): the checks below only ever use 32-bit (.s) or 64-bit (.d) sdiv
; forms; narrower element types are first sign-extended to 32-bit elements.
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
; VBITS_GE_256-NEXT:    mov v0.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
; VBITS_GE_256-NEXT:    mov v0.h[2], w9
; VBITS_GE_256-NEXT:    mov v0.h[3], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
; VBITS_GE_512-NEXT:    mov v0.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
; VBITS_GE_512-NEXT:    mov v0.h[2], w9
; VBITS_GE_512-NEXT:    mov v0.h[3], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = sdiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    sshll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    sshll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = sdiv <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @sdiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = sdiv <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @sdiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = sdiv <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sunpklo z2.s, z1.h
; CHECK-NEXT:    sunpklo z3.s, z0.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = sdiv <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = sdiv <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @sdiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    sdiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = sdiv <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @sdiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = sdiv <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @sdiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = sdiv <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = sdiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = sdiv <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @sdiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: sdiv_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    sdivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    sdiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT:    sdiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: sdiv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    sdiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sdiv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = sdiv <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @sdiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sdiv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = sdiv <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sdiv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = sdiv <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UDIV
;

; Vector vXi8 udiv are not legal for NEON so use SVE when available.
; FIXME: We should be able to improve the codegen for >= 256 bits here.
; NOTE(review): the UDIV tests mirror the SDIV ones, using unsigned widening
; (ushll/uunpklo) and udiv/udivr instead of the signed forms.
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    xtn v0.8b, v0.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    umov w8, v1.h[0]
; VBITS_GE_256-NEXT:    umov w9, v1.h[1]
; VBITS_GE_256-NEXT:    fmov s0, w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[2]
; VBITS_GE_256-NEXT:    mov v0.b[1], w9
; VBITS_GE_256-NEXT:    mov v0.b[2], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[3]
; VBITS_GE_256-NEXT:    mov v0.b[3], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[4]
; VBITS_GE_256-NEXT:    mov v0.b[4], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[5]
; VBITS_GE_256-NEXT:    mov v0.b[5], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[6]
; VBITS_GE_256-NEXT:    mov v0.b[6], w8
; VBITS_GE_256-NEXT:    umov w8, v1.h[7]
; VBITS_GE_256-NEXT:    mov v0.b[7], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d1 killed $d1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z1.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    umov w8, v1.h[0]
; VBITS_GE_512-NEXT:    umov w9, v1.h[1]
; VBITS_GE_512-NEXT:    fmov s0, w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[2]
; VBITS_GE_512-NEXT:    mov v0.b[1], w9
; VBITS_GE_512-NEXT:    mov v0.b[2], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[3]
; VBITS_GE_512-NEXT:    mov v0.b[3], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[4]
; VBITS_GE_512-NEXT:    mov v0.b[4], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[5]
; VBITS_GE_512-NEXT:    mov v0.b[5], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[6]
; VBITS_GE_512-NEXT:    mov v0.b[6], w8
; VBITS_GE_512-NEXT:    umov w8, v1.h[7]
; VBITS_GE_512-NEXT:    mov v0.b[7], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v16i8:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
888; VBITS_GE_512-NEXT: ret 889 %res = udiv <16 x i8> %op1, %op2 890 ret <16 x i8> %res 891} 892 893define void @udiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { 894; CHECK-LABEL: udiv_v32i8: 895; CHECK: // %bb.0: 896; CHECK-NEXT: ptrue p0.s, vl32 897; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] 898; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] 899; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s 900; CHECK-NEXT: st1b { z0.s }, p0, [x0] 901; CHECK-NEXT: ret 902 %op1 = load <32 x i8>, ptr %a 903 %op2 = load <32 x i8>, ptr %b 904 %res = udiv <32 x i8> %op1, %op2 905 store <32 x i8> %res, ptr %a 906 ret void 907} 908 909define void @udiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { 910; CHECK-LABEL: udiv_v64i8: 911; CHECK: // %bb.0: 912; CHECK-NEXT: ptrue p0.s, vl64 913; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] 914; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] 915; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s 916; CHECK-NEXT: st1b { z0.s }, p0, [x0] 917; CHECK-NEXT: ret 918 %op1 = load <64 x i8>, ptr %a 919 %op2 = load <64 x i8>, ptr %b 920 %res = udiv <64 x i8> %op1, %op2 921 store <64 x i8> %res, ptr %a 922 ret void 923} 924 925define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { 926; CHECK-LABEL: udiv_v128i8: 927; CHECK: // %bb.0: 928; CHECK-NEXT: ptrue p0.h, vl128 929; CHECK-NEXT: ptrue p1.s, vl64 930; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] 931; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] 932; CHECK-NEXT: uunpklo z2.s, z0.h 933; CHECK-NEXT: uunpklo z3.s, z1.h 934; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 935; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 936; CHECK-NEXT: uunpklo z0.s, z0.h 937; CHECK-NEXT: uunpklo z1.s, z1.h 938; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s 939; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s 940; CHECK-NEXT: ptrue p1.h, vl64 941; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h 942; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h 943; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h 944; CHECK-NEXT: st1b { z1.h }, p0, [x0] 945; CHECK-NEXT: ret 946 %op1 = load <128 x i8>, ptr %a 947 %op2 = load <128 x i8>, ptr %b 
948 %res = udiv <128 x i8> %op1, %op2 949 store <128 x i8> %res, ptr %a 950 ret void 951} 952 953define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { 954; CHECK-LABEL: udiv_v256i8: 955; CHECK: // %bb.0: 956; CHECK-NEXT: ptrue p0.b, vl256 957; CHECK-NEXT: ptrue p1.s, vl64 958; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 959; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] 960; CHECK-NEXT: uunpklo z2.h, z1.b 961; CHECK-NEXT: uunpklo z3.h, z0.b 962; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 963; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 964; CHECK-NEXT: uunpklo z1.h, z1.b 965; CHECK-NEXT: uunpklo z4.s, z2.h 966; CHECK-NEXT: uunpklo z5.s, z3.h 967; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 968; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 969; CHECK-NEXT: uunpklo z0.h, z0.b 970; CHECK-NEXT: uunpklo z2.s, z2.h 971; CHECK-NEXT: uunpklo z3.s, z3.h 972; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s 973; CHECK-NEXT: uunpklo z5.s, z0.h 974; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 975; CHECK-NEXT: uunpklo z0.s, z0.h 976; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s 977; CHECK-NEXT: uunpklo z3.s, z1.h 978; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 979; CHECK-NEXT: uunpklo z1.s, z1.h 980; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s 981; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h 982; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s 983; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h 984; CHECK-NEXT: ptrue p1.h, vl64 985; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h 986; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h 987; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h 988; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h 989; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b 990; CHECK-NEXT: ptrue p1.b, vl128 991; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b 992; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b 993; CHECK-NEXT: st1b { z0.b }, p0, [x0] 994; CHECK-NEXT: ret 995 %op1 = load <256 x i8>, ptr %a 996 %op2 = load <256 x i8>, ptr %b 997 %res = udiv <256 x i8> %op1, %op2 998 store <256 x i8> %res, ptr %a 999 ret void 1000} 1001 1002; Vector vXi16 udiv are not legal for NEON so use SVE when available. 
; FIXME: We should be able to improve the codegen for >= 256 bits here.
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v4i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v4i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_256-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
; VBITS_GE_256-NEXT:    mov v0.h[1], w8
; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
; VBITS_GE_256-NEXT:    mov v0.h[2], w9
; VBITS_GE_256-NEXT:    mov v0.h[3], w8
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v4i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_512-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
; VBITS_GE_512-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
; VBITS_GE_512-NEXT:    mov v0.h[1], w8
; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
; VBITS_GE_512-NEXT:    mov v0.h[2], w9
; VBITS_GE_512-NEXT:    mov v0.h[3], w8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $q0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; VBITS_GE_128-LABEL: udiv_v8i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %res = udiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

; Codegen below varies with the guaranteed SVE register size
; (VBITS_GE_* check prefixes).
define void @udiv_v16i16(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i16:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
; VBITS_GE_128-NEXT:    ushll2 v5.4s, v4.8h, #0
; VBITS_GE_128-NEXT:    ushll v4.4s, v4.4h, #0
; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    ldr q3, [x0]
; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    uzp1 v1.8h, v3.8h, v5.8h
; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; VBITS_GE_256-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = udiv <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @udiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = udiv <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @udiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0]
; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = udiv <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    uunpklo z2.s, z1.h
; CHECK-NEXT:    uunpklo z3.s, z0.h
; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    uunpklo z1.s, z1.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = udiv <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector v2i32 udiv are not legal for NEON so use SVE when available.
define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 udiv are not legal for NEON so use SVE when available.
define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = udiv <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @udiv_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v16i32:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z4.s
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z5.s
; VBITS_GE_128-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    udiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = udiv <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @udiv_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = udiv <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @udiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = udiv <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 udiv are not legal for NEON so use SVE when available.
define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 udiv are not legal for NEON so use SVE when available.
define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: udiv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = udiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

; i64 divisions use the SVE .d element form directly; no promotion is needed.
define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: udiv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = udiv <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @udiv_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: udiv_v8i64:
; VBITS_GE_128:       // %bb.0:
; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
; VBITS_GE_128-NEXT:    udivr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
; VBITS_GE_128-NEXT:    udiv z1.d, p0/m, z1.d, z4.d
; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z5.d
; VBITS_GE_128-NEXT:    udiv z2.d, p0/m, z2.d, z3.d
; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
; VBITS_GE_128-NEXT:    ret
;
; VBITS_GE_256-LABEL: udiv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    udiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: udiv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = udiv <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @udiv_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: udiv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = udiv <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @udiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: udiv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = udiv <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

; This used to crash because isUnaryPredicate and BuildUDIV don't know how
; a SPLAT_VECTOR of fixed vector type should be handled.
define void @udiv_constantsplat_v8i32(ptr %a) vscale_range(2,0) #1 {
; CHECK-LABEL: udiv_constantsplat_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    mov z1.s, #95 // =0x5f
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
  store <8 x i32> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }
attributes #1 = { "target-features"="+sve" minsize }