; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; SMAX
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @smax_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

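; Functions without a vscale_range attribute, such as smax_v64i8 below, are
; checked against two codegen variants. With 256-bit vectors the 512-bit wide
; operation is split in two: x8 holds the offset of the high half, and movprfx
; copies one operand into the destination so the destructive SVE smax behaves
; like a three-register instruction.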
define void @smax_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smax_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.smax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smax_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.smax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @smax_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

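; For the wider element types below, the offset in x8 counts elements rather
; than bytes: ld1h/st1h scale it with "lsl #1", and the i32 and i64 cases use
; "lsl #2" and "lsl #3" to match the element size.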
define void @smax_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smax_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.smax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smax_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.smax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @smax_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smax_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smax_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.smax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smax_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.smax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 max is not legal for NEON, so use SVE when available.
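; The "// kill" annotations below are register-allocator notes, not
; instructions: d0/q0 alias the low 64/128 bits of z0, so widening a NEON
; value to an SVE register needs no data movement.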
define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 max is not legal for NEON, so use SVE when available.
define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @smax_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smax_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smax_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smax_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smax z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smax_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smax_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smax_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.smax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smax_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smax_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.smax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SMIN
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @smin_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

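; As with smax, the unannotated functions below get VBITS_GE_256 checks for
; the split lowering and VBITS_GE_512 checks for the single-register form.
; The -aarch64-sve-vector-bits-min=2048 RUN line reuses the VBITS_GE_512
; prefixes since the expected code is unchanged once the whole vector fits in
; one register.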
define void @smin_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smin_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.smin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smin_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.smin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @smin_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @smin_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smin_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.smin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smin_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.smin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @smin_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smin_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smin_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smin_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.smin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 min is not legal for NEON, so use SVE when available.
define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 min is not legal for NEON, so use SVE when available.
define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @smin_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smin_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smin_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smin_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    smin z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smin_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smin_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smin_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smin_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smin_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.smin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UMAX
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @umax_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umax_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umax_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.umax.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umax_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.umax.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @umax_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umax_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umax_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.umax.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umax_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.umax.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @umax_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umax_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umax_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.umax.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umax_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.umax.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 max is not legal for NEON, so use SVE when available.
define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 max is not legal for NEON, so use SVE when available.
define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @umax_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umax_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @umax_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umax_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umax z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umax_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umax_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umax_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.umax.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umax_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umax_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.umax.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; UMIN
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2)
  ret <16 x i8> %res
}

define void @umin_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2)
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umin_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %op1, <64 x i8> %op2)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umin_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = call <128 x i8> @llvm.umin.v128i8(<128 x i8> %op1, <128 x i8> %op2)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umin_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = call <256 x i8> @llvm.umin.v256i8(<256 x i8> %op1, <256 x i8> %op2)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2)
  ret <8 x i16> %res
}

define void @umin_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umin_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %op1, <32 x i16> %op2)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umin_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = call <64 x i16> @llvm.umin.v64i16(<64 x i16> %op1, <64 x i16> %op2)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umin_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = call <128 x i16> @llvm.umin.v128i16(<128 x i16> %op1, <128 x i16> %op2)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2)
  ret <4 x i32> %res
}

define void @umin_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umin_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %op1, <16 x i32> %op2)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umin_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = call <32 x i32> @llvm.umin.v32i32(<32 x i32> %op1, <32 x i32> %op2)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umin_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = call <64 x i32> @llvm.umin.v64i32(<64 x i32> %op1, <64 x i32> %op2)
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 min is not legal for NEON, so use SVE when available.
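; As in the other single-vector i64 cases, ptrue with vl1/vl2 limits the
; predicate to exactly the lanes of the NEON-sized value.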
define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2)
  ret <1 x i64> %res
}

; Vector i64 min is not legal for NEON, so use SVE when available.
define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2)
  ret <2 x i64> %res
}

define void @umin_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umin_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @umin_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umin_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    umin z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umin_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %op1, <8 x i64> %op2)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umin_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umin_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = call <16 x i64> @llvm.umin.v16i64(<16 x i64> %op1, <16 x i64> %op2)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umin_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umin_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = call <32 x i64> @llvm.umin.v32i64(<32 x i64> %op1, <32 x i64> %op2)
  store <32 x i64> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }

declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.smin.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.smin.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.smin.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.smin.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.smin.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.smin.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.smin.v32i64(<32 x i64>, <32 x i64>)

declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.smax.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.smax.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.smax.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.smax.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.smax.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.smax.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.smax.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.smax.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.smax.v32i64(<32 x i64>, <32 x i64>)

declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.umin.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.umin.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.umin.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.umin.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.umin.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.umin.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.umin.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.umin.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.umin.v32i64(<32 x i64>, <32 x i64>)

declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>)
declare <128 x i8> @llvm.umax.v128i8(<128 x i8>, <128 x i8>)
declare <256 x i8> @llvm.umax.v256i8(<256 x i8>, <256 x i8>)
declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)
declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>)
declare <64 x i16> @llvm.umax.v64i16(<64 x i16>, <64 x i16>)
declare <128 x i16> @llvm.umax.v128i16(<128 x i16>, <128 x i16>)
declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>)
declare <32 x i32> @llvm.umax.v32i32(<32 x i32>, <32 x i32>)
declare <64 x i32> @llvm.umax.v64i32(<64 x i32>, <64 x i32>)
declare <1 x i64> @llvm.umax.v1i64(<1 x i64>, <1 x i64>)
declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>)
declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>)
declare <32 x i64> @llvm.umax.v32i64(<32 x i64>, <32 x i64>)