; CodeGen/X86/psubus.ll

; Checks that vector compare/select idioms equivalent to an unsigned
; saturating subtract are selected as psubusb/psubusw (vpsubusb/vpsubusw when
; AVX is available). The file is compiled three times, once each for core2,
; corei7-avx and core-avx2, and each compilation is verified with its own
; FileCheck prefix.
; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; test1: per lane of an <8 x i16>, computes (x <s 0) ? (x ^ 0x8000) : 0.
; A lane with its sign bit set is unsigned >= 32768, and clearing that bit
; subtracts 32768, so the select equals an unsigned saturating subtract of
; 32768. All three compilations must emit one psubusw/vpsubusw whose other
; operand comes from the constant pool.
define void @test1(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>* %1, align 2
  ; Mask of lanes whose sign bit is set.
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  ; Flip the sign bit of every lane.
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  ; The result is written back over the input.
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ; Step by 8, the number of i16 lanes handled per iteration.
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2-LABEL: @test1
; SSE2: psubusw LCPI0_0(%rip), %xmm0

; AVX1-LABEL: @test1
; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0

; AVX2-LABEL: @test1
; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
}

; test2: per lane of an <8 x i16>, computes (x >u 32766) ? (x + -32767) : 0,
; i.e. x - 32767 when x is unsigned >= 32767 and 0 otherwise. That is an
; unsigned saturating subtract of 32767. All three compilations must emit one
; psubusw/vpsubusw whose other operand comes from the constant pool.
define void @test2(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>* %1, align 2
  ; Mask of lanes for which the subtraction does not wrap.
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  ; Adding -32767 is the wrapping form of x - 32767.
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  ; The result is written back over the input.
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ; Step by 8, the number of i16 lanes handled per iteration.
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2-LABEL: @test2
; SSE2: psubusw LCPI1_0(%rip), %xmm0

; AVX1-LABEL: @test2
; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0

; AVX2-LABEL: @test2
; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
}

; test3: per lane of an <8 x i16>, computes (x <u w) ? 0 : (x - w), where the
; scalar argument %w is splatted to every lane before the loop. That is an
; unsigned saturating subtract with a variable right-hand side, so all three
; compilations must emit a register-register psubusw/vpsubusw.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  ; Splat %w: insert it in lane 0, then shuffle lane 0 into all 8 lanes.
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i16* %head, i64 %index
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>* %2, align 2
  ; Mask of lanes where x - w would wrap below zero.
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  ; Wrapping lanes are clamped to 0; the others keep the difference.
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  ; The result is written back over the input.
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ; Step by 8, the number of i16 lanes handled per iteration.
  %index.next = add i64 %index, 8
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2-LABEL: @test3
; SSE2: psubusw %xmm0, %xmm1

; AVX1-LABEL: @test3
; AVX1: vpsubusw %xmm0, %xmm1, %xmm1

; AVX2-LABEL: @test3
; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
}

; test4: byte-lane counterpart of test1. Per lane of a <16 x i8>, computes
; (x <s 0) ? (x ^ 0x80) : 0, which equals an unsigned saturating subtract of
; 128. All three compilations must emit one psubusb/vpsubusb whose other
; operand comes from the constant pool.
define void @test4(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>* %1, align 1
  ; Mask of lanes whose sign bit is set.
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  ; Flip the sign bit of every lane.
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  ; The result is written back over the input.
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ; Step by 16, the number of i8 lanes handled per iteration.
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2-LABEL: @test4
; SSE2: psubusb LCPI3_0(%rip), %xmm0

; AVX1-LABEL: @test4
; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0

; AVX2-LABEL: @test4
; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
}

; test5: byte-lane counterpart of test2. Per lane of a <16 x i8>, computes
; (x >u 126) ? (x + -127) : 0, i.e. an unsigned saturating subtract of 127.
; All three compilations must emit one psubusb/vpsubusb whose other operand
; comes from the constant pool.
define void @test5(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>* %1, align 1
  ; Mask of lanes for which the subtraction does not wrap.
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  ; Adding -127 is the wrapping form of x - 127.
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  ; The result is written back over the input.
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ; Step by 16, the number of i8 lanes handled per iteration.
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2-LABEL: @test5
; SSE2: psubusb LCPI4_0(%rip), %xmm0

; AVX1-LABEL: @test5
; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0

; AVX2-LABEL: @test5
; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
}

; test6: byte-lane counterpart of test3. Per lane of a <16 x i8>, computes
; (x <u w) ? 0 : (x - w), where the scalar argument %w is splatted to every
; lane before the loop. All three compilations must emit a register-register
; psubusb/vpsubusb.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  ; Splat %w: insert it in lane 0, then shuffle lane 0 into all 16 lanes.
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i8* %head, i64 %index
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>* %2, align 1
  ; Mask of lanes where x - w would wrap below zero.
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  ; Wrapping lanes are clamped to 0; the others keep the difference.
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  ; The result is written back over the input.
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ; Step by 16, the number of i8 lanes handled per iteration.
  %index.next = add i64 %index, 16
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2-LABEL: @test6
; SSE2: psubusb %xmm0, %xmm1

; AVX1-LABEL: @test6
; AVX1: vpsubusb %xmm0, %xmm1, %xmm1

; AVX2-LABEL: @test6
; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
}

; test7: 256-bit counterpart of test1. Per lane of a <16 x i16>, computes
; (x <s 0) ? (x ^ 0x8000) : 0, which equals an unsigned saturating subtract of
; 32768. Only the core-avx2 compilation is checked here; it must emit one
; ymm-wide vpsubusw whose other operand comes from the constant pool.
define void @test7(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>* %1, align 2
  ; Mask of lanes whose sign bit is set.
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  ; Flip the sign bit of every lane.
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  ; The result is written back over the input.
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ; Step by 16, the number of i16 lanes handled per iteration, so that
  ; consecutive iterations do not overlap.
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2-LABEL: @test7
; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
}

; test8: 256-bit counterpart of test2. Per lane of a <16 x i16>, computes
; (x >u 32766) ? (x + -32767) : 0, i.e. an unsigned saturating subtract of
; 32767. Only the core-avx2 compilation is checked here; it must emit one
; ymm-wide vpsubusw whose other operand comes from the constant pool.
define void @test8(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>* %1, align 2
  ; Mask of lanes for which the subtraction does not wrap.
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  ; Adding -32767 is the wrapping form of x - 32767.
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  ; The result is written back over the input.
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ; Step by 16, the number of i16 lanes handled per iteration, so that
  ; consecutive iterations do not overlap.
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2-LABEL: @test8
; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
}

; test9: 256-bit counterpart of test3. Per lane of a <16 x i16>, computes
; (x <u w) ? 0 : (x - w), where the scalar argument %w is splatted to every
; lane before the loop. Only the core-avx2 compilation is checked here; it
; must emit a register-register ymm-wide vpsubusw.
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  ; Splat %w: insert it in lane 0, then shuffle lane 0 into all 16 lanes.
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i16* %head, i64 %index
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>* %2, align 2
  ; Mask of lanes where x - w would wrap below zero.
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  ; Wrapping lanes are clamped to 0; the others keep the difference.
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  ; The result is written back over the input.
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ; Step by 16, the number of i16 lanes handled per iteration, so that
  ; consecutive iterations do not overlap.
  %index.next = add i64 %index, 16
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2-LABEL: @test9
; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
}

; test10: 256-bit counterpart of test4. Per lane of a <32 x i8>, computes
; (x <s 0) ? (x ^ 0x80) : 0, which equals an unsigned saturating subtract of
; 128. Only the core-avx2 compilation is checked here; it must emit one
; ymm-wide vpsubusb whose other operand comes from the constant pool.
define void @test10(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>* %1, align 1
  ; Mask of lanes whose sign bit is set.
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  ; Flip the sign bit of every lane.
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  ; The result is written back over the input.
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ; Step by 32, the number of i8 lanes handled per iteration, so that
  ; consecutive iterations do not overlap.
  %index.next = add i64 %index, 32
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2-LABEL: @test10
; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
}

; test11: 256-bit counterpart of test5. Per lane of a <32 x i8>, computes
; (x >u 126) ? (x + -127) : 0, i.e. an unsigned saturating subtract of 127.
; Only the core-avx2 compilation is checked here; it must emit one ymm-wide
; vpsubusb whose other operand comes from the constant pool.
define void @test11(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>* %1, align 1
  ; Mask of lanes for which the subtraction does not wrap.
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  ; Adding -127 is the wrapping form of x - 127.
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  ; The result is written back over the input.
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ; Step by 32, the number of i8 lanes handled per iteration, so that
  ; consecutive iterations do not overlap.
  %index.next = add i64 %index, 32
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2-LABEL: @test11
; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
}

; test12: 256-bit counterpart of test6. Per lane of a <32 x i8>, computes
; (x <u w) ? 0 : (x - w), where the scalar argument %w is splatted to every
; lane before the loop. Only the core-avx2 compilation is checked here; it
; must emit a register-register ymm-wide vpsubusb.
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  ; Splat %w: insert it in lane 0, then shuffle lane 0 into all 32 lanes.
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i8* %head, i64 %index
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>* %2, align 1
  ; Mask of lanes where x - w would wrap below zero.
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  ; Wrapping lanes are clamped to 0; the others keep the difference.
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  ; The result is written back over the input.
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ; Step by 32, the number of i8 lanes handled per iteration, so that
  ; consecutive iterations do not overlap.
  %index.next = add i64 %index, 32
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2-LABEL: @test12
; AVX2: vpsubusb %ymm0, %ymm1, %ymm1
}