; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s

; Four back-to-back <2 x half> load / compare / select / store groups located
; 0, 2, 4 and 6 half elements past %rd0. The directives at the end of the body
; expect one <8 x half> load, split into four two-element pieces with
; shufflevector, and one <8 x half> store.
define void @ldg_f16(ptr nocapture align 16 %rd0) {
  ; Slot 0: accessed directly through %rd0 (align 16).
  %val0 = load <2 x half>, ptr %rd0, align 16
  %gt0 = fcmp ogt <2 x half> %val0, zeroinitializer
  %sel0 = select <2 x i1> %gt0, <2 x half> %val0, <2 x half> zeroinitializer
  store <2 x half> %sel0, ptr %rd0, align 16

  ; Slot 1: two half elements past %rd0.
  %addr1 = getelementptr half, ptr %rd0, i64 2
  %val1 = load <2 x half>, ptr %addr1, align 4
  %gt1 = fcmp ogt <2 x half> %val1, zeroinitializer
  %sel1 = select <2 x i1> %gt1, <2 x half> %val1, <2 x half> zeroinitializer
  store <2 x half> %sel1, ptr %addr1, align 4

  ; Slot 2: four half elements past %rd0.
  %addr2 = getelementptr half, ptr %rd0, i64 4
  %val2 = load <2 x half>, ptr %addr2, align 4
  %gt2 = fcmp ogt <2 x half> %val2, zeroinitializer
  %sel2 = select <2 x i1> %gt2, <2 x half> %val2, <2 x half> zeroinitializer
  store <2 x half> %sel2, ptr %addr2, align 4

  ; Slot 3: six half elements past %rd0.
  %addr3 = getelementptr half, ptr %rd0, i64 6
  %val3 = load <2 x half>, ptr %addr3, align 4
  %gt3 = fcmp ogt <2 x half> %val3, zeroinitializer
  %sel3 = select <2 x i1> %gt3, <2 x half> %val3, <2 x half> zeroinitializer
  store <2 x half> %sel3, ptr %addr3, align 4
  ret void

; CHECK-LABEL: @ldg_f16
; CHECK: %[[LD:.*]] = load <8 x half>, ptr
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 4, i32 5>
; CHECK: shufflevector <8 x half> %[[LD]], <8 x half> poison, <2 x i32> <i32 6, i32 7>
; CHECK: store <8 x half>
}

; Same access pattern with <3 x half> values located 0, 3, 6 and 9 half
; elements past %rd0. The directives only require that no shufflevector shows
; up in the output for this function.
define void @no_nonpow2_vector(ptr nocapture align 16 %rd0) {
  ; Slot 0: accessed directly through %rd0.
  %val0 = load <3 x half>, ptr %rd0, align 4
  %gt0 = fcmp ogt <3 x half> %val0, zeroinitializer
  %sel0 = select <3 x i1> %gt0, <3 x half> %val0, <3 x half> zeroinitializer
  store <3 x half> %sel0, ptr %rd0, align 4

  ; Slot 1: three half elements past %rd0.
  %addr1 = getelementptr half, ptr %rd0, i64 3
  %val1 = load <3 x half>, ptr %addr1, align 4
  %gt1 = fcmp ogt <3 x half> %val1, zeroinitializer
  %sel1 = select <3 x i1> %gt1, <3 x half> %val1, <3 x half> zeroinitializer
  store <3 x half> %sel1, ptr %addr1, align 4

  ; Slot 2: six half elements past %rd0.
  %addr2 = getelementptr half, ptr %rd0, i64 6
  %val2 = load <3 x half>, ptr %addr2, align 4
  %gt2 = fcmp ogt <3 x half> %val2, zeroinitializer
  %sel2 = select <3 x i1> %gt2, <3 x half> %val2, <3 x half> zeroinitializer
  store <3 x half> %sel2, ptr %addr2, align 4

  ; Slot 3: nine half elements past %rd0.
  %addr3 = getelementptr half, ptr %rd0, i64 9
  %val3 = load <3 x half>, ptr %addr3, align 4
  %gt3 = fcmp ogt <3 x half> %val3, zeroinitializer
  %sel3 = select <3 x i1> %gt3, <3 x half> %val3, <3 x half> zeroinitializer
  store <3 x half> %sel3, ptr %addr3, align 4
  ret void

; CHECK-LABEL: @no_nonpow2_vector
; CHECK-NOT: shufflevector
}

; Same access pattern with <2 x ptr> values located 0, 2, 4 and 6 pointer
; elements past %rd0, compared with icmp ne against zeroinitializer. The
; directives only require that no shufflevector shows up in the output for
; this function.
define void @no_pointer_vector(ptr nocapture align 16 %rd0) {
  ; Slot 0: accessed directly through %rd0.
  %val0 = load <2 x ptr>, ptr %rd0, align 4
  %nz0 = icmp ne <2 x ptr> %val0, zeroinitializer
  %sel0 = select <2 x i1> %nz0, <2 x ptr> %val0, <2 x ptr> zeroinitializer
  store <2 x ptr> %sel0, ptr %rd0, align 4

  ; Slot 1: two pointer elements past %rd0.
  %addr1 = getelementptr ptr, ptr %rd0, i64 2
  %val1 = load <2 x ptr>, ptr %addr1, align 4
  %nz1 = icmp ne <2 x ptr> %val1, zeroinitializer
  %sel1 = select <2 x i1> %nz1, <2 x ptr> %val1, <2 x ptr> zeroinitializer
  store <2 x ptr> %sel1, ptr %addr1, align 4

  ; Slot 2: four pointer elements past %rd0.
  %addr2 = getelementptr ptr, ptr %rd0, i64 4
  %val2 = load <2 x ptr>, ptr %addr2, align 4
  %nz2 = icmp ne <2 x ptr> %val2, zeroinitializer
  %sel2 = select <2 x i1> %nz2, <2 x ptr> %val2, <2 x ptr> zeroinitializer
  store <2 x ptr> %sel2, ptr %addr2, align 4

  ; Slot 3: six pointer elements past %rd0.
  %addr3 = getelementptr ptr, ptr %rd0, i64 6
  %val3 = load <2 x ptr>, ptr %addr3, align 4
  %nz3 = icmp ne <2 x ptr> %val3, zeroinitializer
  %sel3 = select <2 x i1> %nz3, <2 x ptr> %val3, <2 x ptr> zeroinitializer
  store <2 x ptr> %sel3, ptr %addr3, align 4
  ret void

; CHECK-LABEL: @no_pointer_vector
; CHECK-NOT: shufflevector
}