; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0

define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
; CHECK-LABEL: SHA256_Compress_Generic:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movbel 0, %eax
; CHECK-NEXT:    movbel 12(%rdi), %ecx
; CHECK-NEXT:    vmovd %eax, %xmm0
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm0
; CHECK-NEXT:    vpslld $15, %xmm2, %xmm3
; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm3
; CHECK-NEXT:    vpslld $13, %xmm2, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm3
; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm0
; CHECK-NEXT:    vmovd %ecx, %xmm4
; CHECK-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm0
; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
; CHECK-NEXT:    vpor %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm4
; CHECK-NEXT:    vpslld $15, %xmm0, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm5
; CHECK-NEXT:    vpslld $13, %xmm0, %xmm6
; CHECK-NEXT:    vpor %xmm5, %xmm6, %xmm5
; CHECK-NEXT:    vpxor %xmm5, %xmm4, %xmm4
; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm0
; CHECK-NEXT:    vpxor %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm4
; CHECK-NEXT:    vpslld $15, %xmm0, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm5
; CHECK-NEXT:    vpslld $13, %xmm0, %xmm6
; CHECK-NEXT:    vpor %xmm5, %xmm6, %xmm5
; CHECK-NEXT:    vpxor %xmm5, %xmm4, %xmm4
; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm5
; CHECK-NEXT:    vpxor %xmm5, %xmm4, %xmm4
; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
; CHECK-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm3
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm2
; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
; CHECK-NEXT:    vpor %xmm2, %xmm4, %xmm2
; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; CHECK-NEXT:    vpsrld $10, %xmm1, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm2
; CHECK-NEXT:    vpslld $15, %xmm0, %xmm3
; CHECK-NEXT:    vpor %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm3
; CHECK-NEXT:    vpslld $13, %xmm0, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; CHECK-NEXT:    vpsllq $32, %xmm1, %xmm3
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    vmovdqu %ymm0, 132(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %0 = load i32, ptr null, align 4
  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
  %2 = load i32, ptr %arrayidx14, align 4
  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
  %7 = xor <2 x i32> %5, %6
  %8 = lshr <2 x i32> %4, zeroinitializer
  %9 = xor <2 x i32> %7, %8
  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
  %12 = add <2 x i32> %11, %9
  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
  %15 = xor <2 x i32> %13, %14
  %16 = lshr <2 x i32> %12, zeroinitializer
  %17 = xor <2 x i32> %15, %16
  %18 = add <2 x i32> %4, %17
  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
  %21 = xor <2 x i32> %19, %20
  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
  %23 = xor <2 x i32> %21, %22
  %24 = add <2 x i32> %4, %23
  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
  %27 = xor <2 x i32> %25, %26
  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
  %29 = xor <2 x i32> %27, %28
  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
  %31 = add <2 x i32> %30, %29
  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
  %34 = xor <2 x i32> %32, %33
  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
  %36 = xor <2 x i32> %34, %35
  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
  %38 = add <2 x i32> %37, %36
  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
  store <2 x i32> %38, ptr %arrayidx918, align 4
  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
  %41 = xor <2 x i32> %39, %40
  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
  %43 = xor <2 x i32> %41, %42
  %44 = add <2 x i32> %37, %43
  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
  %47 = xor <2 x i32> %45, %46
  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
  %49 = xor <2 x i32> %47, %48
  %50 = lshr <2 x i32> %24, zeroinitializer
  %51 = add <2 x i32> %50, %49
  store <2 x i32> %51, ptr %arrayidx1106, align 4
  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
  %54 = xor <2 x i32> %52, %53
  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
  %56 = xor <2 x i32> %54, %55
  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
  %58 = insertelement <2 x i32> %57, i32 0, i64 0
  %59 = add <2 x i32> %58, %56
  store <2 x i32> %59, ptr %arrayidx1200, align 4
  ret void

; uselistorder directives
  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
}

declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2

; uselistorder directives
uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }

attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { nounwind memory(none) }