; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN

; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 8
; CHECK-NEXT:    vle16.v v9, (a2)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vle16.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}

define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-MISALIGN-NEXT:    vle64.v v8, (a0)
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Vector is too large to fit into a single strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vse32.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, can be a stride 0 load
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 6
; CHECK-NEXT:    vle16.v v10, (a2)
; CHECK-NEXT:    addi a2, a0, 2
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vle16.v v11, (a2)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT:    vle16.v v8, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    add a4, a0, a4
; RV32-NEXT:    vle16.v v10, (a4)
; RV32-NEXT:    add a2, a4, a2
; RV32-NEXT:    vle16.v v9, (a2)
; RV32-NEXT:    vle16.v v11, (a0)
; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT:    vslideup.vi v10, v9, 4
; RV32-NEXT:    vslideup.vi v8, v11, 4
; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT:    vslideup.vi v8, v10, 8
; RV32-NEXT:    vse16.v v8, (a1)
; RV32-NEXT:    ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT:    vle16.v v8, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    add a3, a0, a3
; RV64-NEXT:    vle16.v v10, (a3)
; RV64-NEXT:    add a2, a3, a2
; RV64-NEXT:    vle16.v v9, (a2)
; RV64-NEXT:    vle16.v v11, (a0)
; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV64-NEXT:    vslideup.vi v10, v9, 4
; RV64-NEXT:    vslideup.vi v8, v11, 4
; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT:    vslideup.vi v8, v10, 8
; RV64-NEXT:    vse16.v v8, (a1)
; RV64-NEXT:    ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F:       # %bb.0:
; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT:    vle16.v v8, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    add a3, a0, a3
; ZVE64F-NEXT:    vle16.v v10, (a3)
; ZVE64F-NEXT:    add a2, a3, a2
; ZVE64F-NEXT:    vle16.v v9, (a2)
; ZVE64F-NEXT:    vle16.v v11, (a0)
; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVE64F-NEXT:    vslideup.vi v10, v9, 4
; ZVE64F-NEXT:    vslideup.vi v8, v11, 4
; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
; ZVE64F-NEXT:    vse16.v v8, (a1)
; ZVE64F-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}

define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, 192
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -192
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; This is a strided load with a negative stride
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a3, a2, a2
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    neg a2, a2
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; The middle end sometimes produces this pattern of shuffles, where the
; intermediate shuffles are the full result vector size padded with poison
; elements.
define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
; CHECK-LABEL: widen_4xv4i8_immediate_expand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %a = load <4 x i8>, ptr %p
  %b.ptr = getelementptr i8, ptr %p, i64 %s
  %b = load <4 x i8>, ptr %b.ptr
  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
  %c = load <4 x i8>, ptr %c.ptr
  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
  %d = load <4 x i8>, ptr %d.ptr

  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  ret <16 x i8> %abcd
}