; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTV
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTZVE32F

%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB0_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB0_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

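; Masked variant of @gather. Roughly equivalent C (illustrative only; the
; mask and maskedoff arrays model the fixed lane predicate and passthru and
; are not from the original source):
;   for (int i = 0; i != 1024; ++i)
;     A[i] += mask[i % 32] ? B[i * 5] : maskedoff[i % 32];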
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: gather_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB1_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vlse8.v v9, (a1), a4, v0.t
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vse8.v v9, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB1_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

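; Same as @gather, except the index vector counts down within each block of
; 32, so the strided load walks B backwards. Roughly (illustrative C):
;   for (int i = 0; i != 1024; i += 32)
;     for (int j = 0; j != 32; ++j)
;       A[i + j] += B[(i + 31 - j) * 5];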
define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 155
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, -5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB2_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB2_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

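; Every lane reads the same address (stride 0). Roughly (illustrative C):
;   for (int i = 0; i != 1024; i += 32)
;     for (int j = 0; j != 32; ++j)
;       A[i + j] += B[i * 5];
; Without +optimized-zero-stride-load this lowers to a scalar lbu plus
; vadd.vx rather than a zero-stride vlse8.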
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:  .LBB3_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB3_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

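; i32 variant of @gather_zero_stride: every lane of the gather reads the
; same word, so it lowers to a scalar lw plus vadd.vx. Note that, as written
; in the IR below, the load/store pointer advances by 8 bytes (not 8
; elements) per iteration.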
define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB4_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lw a3, 0(a1)
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB4_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

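; Here the splatted gather result is the dividend of a vdivu rather than a
; vadd operand, so it cannot be folded into a .vx form; without
; +optimized-zero-stride-load it is materialized with vmv.v.x. Roughly
; (illustrative C):
;   for (int i = 0; i != 1024; i += 32)
;     for (int j = 0; j != 32; ++j)
;       A[i + j] = B[i * 5] / A[i + j];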
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V:       # %bb.0: # %entry
; V-NEXT:    addi a2, a0, 1024
; V-NEXT:    li a3, 32
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT:  .LBB5_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    lbu a3, 0(a1)
; V-NEXT:    vle8.v v8, (a0)
; V-NEXT:    vmv.v.x v9, a3
; V-NEXT:    vdivu.vv v8, v9, v8
; V-NEXT:    vse8.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB5_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    addi a2, a0, 1024
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT:  .LBB5_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    lbu a3, 0(a1)
; ZVE32F-NEXT:    vle8.v v8, (a0)
; ZVE32F-NEXT:    vmv.v.x v9, a3
; ZVE32F-NEXT:    vdivu.vv v8, v9, v8
; ZVE32F-NEXT:    vse8.v v8, (a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a1, a1, 160
; ZVE32F-NEXT:    bne a0, a2, .LBB5_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
;
; OPTIMIZED-LABEL: gather_zero_stride_unfold:
; OPTIMIZED:       # %bb.0: # %entry
; OPTIMIZED-NEXT:    addi a2, a0, 1024
; OPTIMIZED-NEXT:    li a3, 32
; OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; OPTIMIZED-NEXT:  .LBB5_1: # %vector.body
; OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTIMIZED-NEXT:    vlse8.v v8, (a1), zero
; OPTIMIZED-NEXT:    vle8.v v9, (a0)
; OPTIMIZED-NEXT:    vdivu.vv v8, v8, v9
; OPTIMIZED-NEXT:    vse8.v v8, (a0)
; OPTIMIZED-NEXT:    addi a0, a0, 32
; OPTIMIZED-NEXT:    addi a1, a1, 160
; OPTIMIZED-NEXT:    bne a0, a2, .LBB5_1
; OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
; OPTIMIZED-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB6_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vlse8.v v9, (a0), a3
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse8.v v8, (a0), a3
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB6_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

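; Masked variant of @scatter: the gather of A[i * 5] and the scatter back
; use the same fixed lane mask. Roughly (illustrative C; mask models the
; lane predicate and is not from the original source):
;   for (int i = 0; i != 1024; ++i)
;     if (mask[i % 32]) A[i * 5] += B[i];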
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: scatter_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB7_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vmv1r.v v10, v8
; CHECK-NEXT:    vlse8.v v10, (a0), a4, v0.t
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vsse8.v v9, (a0), a4, v0.t
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB7_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     A[i] += B[i * 4];
; }
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a3, 1
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    add a3, a0, a3
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a1), a2
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 128
; CHECK-NEXT:    bne a0, a3, .LBB8_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a3, 1
; CHECK-NEXT:    li a2, 32
; CHECK-NEXT:    add a3, a1, a3
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bne a1, a3, .LBB9_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;struct foo {
;  int a, b, c, d;
;};
;
;void struct_gather(int * __restrict A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;    A[i] += B[i].b;
;}
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 132
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    addi a4, a0, 32
; CHECK-NEXT:    addi a5, a1, -128
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    vlse32.v v10, (a5), a3
; CHECK-NEXT:    vle32.v v11, (a4)
; CHECK-NEXT:    vadd.vv v9, v9, v10
; CHECK-NEXT:    vadd.vv v8, v11, v8
; CHECK-NEXT:    vse32.v v9, (a0)
; CHECK-NEXT:    vse32.v v8, (a4)
; CHECK-NEXT:    addi a0, a0, 64
; CHECK-NEXT:    addi a1, a1, 256
; CHECK-NEXT:    bne a0, a2, .LBB10_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, splat (i64 8)
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 256
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    addi a5, a1, 16
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 4
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 32
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 8
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 48
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 12
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a1, a1, 512
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bnez a2, .LBB11_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
  %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
  %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
  %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
  %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
  %i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
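; Note: +zve32f implies ELEN=32, so the <2 x i64>-indexed accesses below are
; not legal vector operations there; the ZVE32F and OPTZVE32F runs scalarize
; them into plain ld/sd.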
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a0, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB12_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vlse64.v v8, (a1), a3
; V-NEXT:    addi a4, a1, 80
; V-NEXT:    vlse64.v v9, (a4), a3
; V-NEXT:    addi a4, a0, 16
; V-NEXT:    vse64.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    vse64.v v9, (a4)
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB12_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a4, 2
; ZVE32F-NEXT:    li a3, 1
; ZVE32F-NEXT:    add a4, a0, a4
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB12_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    mul a6, a3, a5
; ZVE32F-NEXT:    mul a7, a2, a5
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    add a6, a1, a6
; ZVE32F-NEXT:    add a7, a1, a7
; ZVE32F-NEXT:    ld t0, 0(a7)
; ZVE32F-NEXT:    ld t1, 0(a6)
; ZVE32F-NEXT:    ld a7, 80(a7)
; ZVE32F-NEXT:    ld a6, 80(a6)
; ZVE32F-NEXT:    sd t0, 0(a0)
; ZVE32F-NEXT:    sd t1, 8(a0)
; ZVE32F-NEXT:    sd a7, 16(a0)
; ZVE32F-NEXT:    sd a6, 24(a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a3, a3, 4
; ZVE32F-NEXT:    bne a0, a4, .LBB12_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
;
; OPTV-LABEL: gather_of_pointers:
; OPTV:       # %bb.0: # %bb
; OPTV-NEXT:    lui a2, 2
; OPTV-NEXT:    add a2, a0, a2
; OPTV-NEXT:    li a3, 40
; OPTV-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; OPTV-NEXT:  .LBB12_1: # %bb2
; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTV-NEXT:    vlse64.v v8, (a1), a3
; OPTV-NEXT:    addi a4, a1, 80
; OPTV-NEXT:    vlse64.v v9, (a4), a3
; OPTV-NEXT:    addi a4, a0, 16
; OPTV-NEXT:    vse64.v v8, (a0)
; OPTV-NEXT:    addi a0, a0, 32
; OPTV-NEXT:    vse64.v v9, (a4)
; OPTV-NEXT:    addi a1, a1, 160
; OPTV-NEXT:    bne a0, a2, .LBB12_1
; OPTV-NEXT:  # %bb.2: # %bb18
; OPTV-NEXT:    ret
;
; OPTZVE32F-LABEL: gather_of_pointers:
; OPTZVE32F:       # %bb.0: # %bb
; OPTZVE32F-NEXT:    li a2, 0
; OPTZVE32F-NEXT:    lui a4, 2
; OPTZVE32F-NEXT:    li a3, 1
; OPTZVE32F-NEXT:    add a4, a0, a4
; OPTZVE32F-NEXT:    li a5, 40
; OPTZVE32F-NEXT:  .LBB12_1: # %bb2
; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT:    mul a6, a3, a5
; OPTZVE32F-NEXT:    mul a7, a2, a5
; OPTZVE32F-NEXT:    addi a2, a2, 4
; OPTZVE32F-NEXT:    add a6, a1, a6
; OPTZVE32F-NEXT:    add a7, a1, a7
; OPTZVE32F-NEXT:    ld t0, 0(a7)
; OPTZVE32F-NEXT:    ld t1, 0(a6)
; OPTZVE32F-NEXT:    ld a7, 80(a7)
; OPTZVE32F-NEXT:    ld a6, 80(a6)
; OPTZVE32F-NEXT:    sd t0, 0(a0)
; OPTZVE32F-NEXT:    sd t1, 8(a0)
; OPTZVE32F-NEXT:    sd a7, 16(a0)
; OPTZVE32F-NEXT:    sd a6, 24(a0)
; OPTZVE32F-NEXT:    addi a0, a0, 32
; OPTZVE32F-NEXT:    addi a3, a3, 4
; OPTZVE32F-NEXT:    bne a0, a4, .LBB12_1
; OPTZVE32F-NEXT:  # %bb.2: # %bb18
; OPTZVE32F-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i5 = mul <2 x i64> %i3, splat (i64 5)
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a1, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB13_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 16
; V-NEXT:    vle64.v v8, (a1)
; V-NEXT:    vle64.v v9, (a4)
; V-NEXT:    addi a4, a0, 80
; V-NEXT:    addi a1, a1, 32
; V-NEXT:    vsse64.v v8, (a0), a3
; V-NEXT:    vsse64.v v9, (a4), a3
; V-NEXT:    addi a0, a0, 160
; V-NEXT:    bne a1, a2, .LBB13_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a4, 2
; ZVE32F-NEXT:    li a3, 1
; ZVE32F-NEXT:    add a4, a1, a4
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB13_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    ld a6, 0(a1)
; ZVE32F-NEXT:    ld a7, 8(a1)
; ZVE32F-NEXT:    ld t0, 16(a1)
; ZVE32F-NEXT:    ld t1, 24(a1)
; ZVE32F-NEXT:    mul t2, a3, a5
; ZVE32F-NEXT:    mul t3, a2, a5
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a1, a1, 32
; ZVE32F-NEXT:    add t2, a0, t2
; ZVE32F-NEXT:    add t3, a0, t3
; ZVE32F-NEXT:    sd a6, 0(t3)
; ZVE32F-NEXT:    sd a7, 0(t2)
; ZVE32F-NEXT:    sd t0, 80(t3)
; ZVE32F-NEXT:    sd t1, 80(t2)
; ZVE32F-NEXT:    addi a3, a3, 4
; ZVE32F-NEXT:    bne a1, a4, .LBB13_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
;
; OPTV-LABEL: scatter_of_pointers:
; OPTV:       # %bb.0: # %bb
; OPTV-NEXT:    lui a2, 2
; OPTV-NEXT:    add a2, a1, a2
; OPTV-NEXT:    li a3, 40
; OPTV-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; OPTV-NEXT:  .LBB13_1: # %bb2
; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTV-NEXT:    addi a4, a1, 16
; OPTV-NEXT:    vle64.v v8, (a1)
; OPTV-NEXT:    vle64.v v9, (a4)
; OPTV-NEXT:    addi a4, a0, 80
; OPTV-NEXT:    addi a1, a1, 32
; OPTV-NEXT:    vsse64.v v8, (a0), a3
; OPTV-NEXT:    vsse64.v v9, (a4), a3
; OPTV-NEXT:    addi a0, a0, 160
; OPTV-NEXT:    bne a1, a2, .LBB13_1
; OPTV-NEXT:  # %bb.2: # %bb18
; OPTV-NEXT:    ret
;
; OPTZVE32F-LABEL: scatter_of_pointers:
; OPTZVE32F:       # %bb.0: # %bb
; OPTZVE32F-NEXT:    li a2, 0
; OPTZVE32F-NEXT:    lui a4, 2
; OPTZVE32F-NEXT:    li a3, 1
; OPTZVE32F-NEXT:    add a4, a1, a4
; OPTZVE32F-NEXT:    li a5, 40
; OPTZVE32F-NEXT:  .LBB13_1: # %bb2
; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT:    ld a6, 0(a1)
; OPTZVE32F-NEXT:    ld a7, 8(a1)
; OPTZVE32F-NEXT:    ld t0, 16(a1)
; OPTZVE32F-NEXT:    ld t1, 24(a1)
; OPTZVE32F-NEXT:    mul t2, a3, a5
; OPTZVE32F-NEXT:    mul t3, a2, a5
; OPTZVE32F-NEXT:    addi a2, a2, 4
; OPTZVE32F-NEXT:    addi a1, a1, 32
; OPTZVE32F-NEXT:    add t2, a0, t2
; OPTZVE32F-NEXT:    add t3, a0, t3
; OPTZVE32F-NEXT:    sd a6, 0(t3)
; OPTZVE32F-NEXT:    sd a7, 0(t2)
; OPTZVE32F-NEXT:    sd t0, 80(t3)
; OPTZVE32F-NEXT:    sd t1, 80(t2)
; OPTZVE32F-NEXT:    addi a3, a3, 4
; OPTZVE32F-NEXT:    bne a1, a4, .LBB13_1
; OPTZVE32F-NEXT:  # %bb.2: # %bb18
; OPTZVE32F-NEXT:    ret
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i11 = mul <2 x i64> %i3, splat (i64 5)
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)

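; As @gather, but the loop starts at a runtime index; roughly (illustrative
; C) "for (int i = arg2; i != 1024; ++i) arg[i] += arg1[i * 5];". The
; vectorized loop handles blocks of 32 and a scalar loop mops up the
; remainder.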
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    li a3, 1024
; CHECK-NEXT:    beq a2, a3, .LBB14_7
; CHECK-NEXT:  # %bb.1: # %bb3
; CHECK-NEXT:    li a3, 1023
; CHECK-NEXT:    subw a5, a3, a2
; CHECK-NEXT:    li a6, 31
; CHECK-NEXT:    mv a4, a2
; CHECK-NEXT:    bltu a5, a6, .LBB14_5
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    slli a4, a5, 32
; CHECK-NEXT:    slli t0, a2, 2
; CHECK-NEXT:    add a5, a0, a2
; CHECK-NEXT:    add a6, a1, a2
; CHECK-NEXT:    li t2, 32
; CHECK-NEXT:    srli a4, a4, 32
; CHECK-NEXT:    add t0, a6, t0
; CHECK-NEXT:    addi a6, a4, 1
; CHECK-NEXT:    andi a7, a6, -32
; CHECK-NEXT:    add a4, a7, a2
; CHECK-NEXT:    add a2, a4, a0
; CHECK-NEXT:    li t1, 5
; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
; CHECK-NEXT:  .LBB14_3: # %bb15
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (t0), t1
; CHECK-NEXT:    vle8.v v9, (a5)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a5)
; CHECK-NEXT:    addi a5, a5, 32
; CHECK-NEXT:    addi t0, t0, 160
; CHECK-NEXT:    bne a5, a2, .LBB14_3
; CHECK-NEXT:  # %bb.4: # %bb30
; CHECK-NEXT:    beq a6, a7, .LBB14_7
; CHECK-NEXT:  .LBB14_5: # %bb32
; CHECK-NEXT:    add a2, a0, a4
; CHECK-NEXT:    slli a5, a4, 2
; CHECK-NEXT:    add a1, a1, a4
; CHECK-NEXT:    subw a3, a3, a4
; CHECK-NEXT:    add a1, a1, a5
; CHECK-NEXT:    slli a3, a3, 32
; CHECK-NEXT:    srli a3, a3, 32
; CHECK-NEXT:    add a0, a4, a0
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    addi a0, a0, 1
; CHECK-NEXT:  .LBB14_6: # %bb35
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    lbu a4, 0(a2)
; CHECK-NEXT:    add a3, a4, a3
; CHECK-NEXT:    sb a3, 0(a2)
; CHECK-NEXT:    addi a2, a2, 1
; CHECK-NEXT:    addi a1, a1, 5
; CHECK-NEXT:    bne a2, a0, .LBB14_6
; CHECK-NEXT:  .LBB14_7: # %bb34
; CHECK-NEXT:    ret
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3:                                              ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9:                                              ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15:                                             ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, splat (i64 32)
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30:                                             ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32:                                             ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34:                                             ; preds = %bb35, %bb30, %bb
  ret void

bb35:                                             ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)

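; The trip count (arg2 * 16) is always a multiple of the vector factor of
; 16, so no scalar remainder loop is needed. Roughly (illustrative C):
;   for (uint64_t i = 0; i != arg2 * 16; ++i)
;     arg[i] += arg1[i * 5];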
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    slli a2, a2, 4
; CHECK-NEXT:    beqz a2, .LBB15_3
; CHECK-NEXT:  # %bb.1: # %bb2
; CHECK-NEXT:    addi a2, a2, -16
; CHECK-NEXT:    andi a2, a2, -16
; CHECK-NEXT:    add a2, a2, a0
; CHECK-NEXT:    addi a2, a2, 16
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT:  .LBB15_2: # %bb4
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    addi a1, a1, 80
; CHECK-NEXT:    bne a0, a2, .LBB15_2
; CHECK-NEXT:  .LBB15_3: # %bb16
; CHECK-NEXT:    ret
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2:                                              ; preds = %bb
  br label %bb4

bb4:                                              ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, splat (i64 5)
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, splat (i64 16)
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16:                                             ; preds = %bb4, %bb
  ret void
}

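; Floating-point zero-stride gather: every lane reads the same float, so it
; lowers to a scalar flw plus vfadd.vf. Note that, as written, only the
; first 8 floats of each block of 32 are updated. Roughly (illustrative C):
;   for (int i = 0; i != 1024; i += 32)
;     for (int j = 0; j != 8; ++j)
;       A[i + j] += B[i * 5];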
define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_fp:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB16_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flw fa5, 0(a1)
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfadd.vf v8, v8, fa5
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    addi a1, a1, 640
; CHECK-NEXT:    bne a0, a2, .LBB16_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x float> undef)
  %i2 = getelementptr inbounds float, ptr %A, i64 %index
  %wide.load = load <8 x float>, ptr %i2, align 4
  %i4 = fadd <8 x float> %wide.load, %wide.masked.gather
  store <8 x float> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}