xref: /llvm-project/llvm/test/CodeGen/X86/pr67333.ll (revision 9540a7ae82dfabe551bfef94fc9f29ebebf841da)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
3
; Lifetime marker intrinsics. The function defined in this test contains no
; call to either of them; these two declarations are the only users of
; attribute group #0.
4declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
5declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
6
; Codegen test function. The IR body is compiled by llc and the expected
; x86-64 assembly is pinned by the autogenerated assertion comments that
; directly follow the define line.
;
; What the IR does:
;   - loads one i32 from a null pointer and one from element 3 of the
;     [64 x i32] array at %ctx, and passes each through a "bswap" inline asm
;     (the expected assembly shows these as movbel loads from 0 and
;     12(%rdi));
;   - builds <2 x i32> vectors from those two words and runs them through a
;     repeated pattern of rotate-left by 15, rotate-left by 13, lshr, xor and
;     add (each rotate is an fshl call whose two data operands are equal);
;   - stores four <2 x i32> values at elements 33, 35, 37 and 39 of %ctx.
;
; The expected assembly contains a single store: a 32-byte vmovdqu of %ymm0
; to 132(%rdi) (132 = 33 * 4 bytes), followed by vzeroupper and retq.
7define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
8; CHECK-LABEL: SHA256_Compress_Generic:
9; CHECK:       # %bb.0: # %entry
10; CHECK-NEXT:    movbel 0, %eax
11; CHECK-NEXT:    movbel 12(%rdi), %ecx
12; CHECK-NEXT:    vmovd %eax, %xmm0
13; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
14; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
15; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm0
16; CHECK-NEXT:    vpslld $15, %xmm2, %xmm3
17; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
18; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm3
19; CHECK-NEXT:    vpslld $13, %xmm2, %xmm4
20; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
21; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm3
22; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm0
23; CHECK-NEXT:    vmovd %ecx, %xmm4
24; CHECK-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
25; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
26; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm0
27; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
28; CHECK-NEXT:    vpor %xmm0, %xmm4, %xmm0
29; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
30; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
31; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
32; CHECK-NEXT:    vpxor %xmm4, %xmm0, %xmm0
33; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
34; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
35; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm4
36; CHECK-NEXT:    vpslld $15, %xmm0, %xmm5
37; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
38; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm5
39; CHECK-NEXT:    vpslld $13, %xmm0, %xmm6
40; CHECK-NEXT:    vpor %xmm5, %xmm6, %xmm5
41; CHECK-NEXT:    vpxor %xmm5, %xmm4, %xmm4
42; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm0
43; CHECK-NEXT:    vpxor %xmm0, %xmm4, %xmm0
44; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
45; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm4
46; CHECK-NEXT:    vpslld $15, %xmm0, %xmm5
47; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
48; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm5
49; CHECK-NEXT:    vpslld $13, %xmm0, %xmm6
50; CHECK-NEXT:    vpor %xmm5, %xmm6, %xmm5
51; CHECK-NEXT:    vpxor %xmm5, %xmm4, %xmm4
52; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm5
53; CHECK-NEXT:    vpxor %xmm5, %xmm4, %xmm4
54; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
55; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
56; CHECK-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
57; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
58; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
59; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
60; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
61; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
62; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
63; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
64; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
65; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
66; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm3
67; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
68; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm2
69; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
70; CHECK-NEXT:    vpor %xmm2, %xmm4, %xmm2
71; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
72; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
73; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
74; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
75; CHECK-NEXT:    vpsrld $10, %xmm1, %xmm4
76; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
77; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
78; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
79; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
80; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
81; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
82; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
83; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
84; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
85; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
86; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
87; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
88; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm2
89; CHECK-NEXT:    vpslld $15, %xmm0, %xmm3
90; CHECK-NEXT:    vpor %xmm2, %xmm3, %xmm2
91; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm3
92; CHECK-NEXT:    vpslld $13, %xmm0, %xmm4
93; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
94; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
95; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm3
96; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
97; CHECK-NEXT:    vpsllq $32, %xmm1, %xmm3
98; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
99; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
100; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
101; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
102; CHECK-NEXT:    vmovdqu %ymm0, 132(%rdi)
103; CHECK-NEXT:    vzeroupper
104; CHECK-NEXT:    retq
105entry:
  ; Two input words: %1 comes from a load through a null pointer, %3 from
  ; %ctx[3]; each loaded value is passed through the "bswap $0" inline asm.
106  %0 = load i32, ptr null, align 4
107  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
108  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
109  %2 = load i32, ptr %arrayidx14, align 4
110  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
  ; %4 = <0, %1>. %9 = rotl(%4, 15) ^ rotl(%4, 13) ^ (%4 lshr 0).
111  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
112  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
113  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
114  %7 = xor <2 x i32> %5, %6
115  %8 = lshr <2 x i32> %4, zeroinitializer
116  %9 = xor <2 x i32> %7, %8
  ; %10 = <%3, 0>; the shuffle mask <1, 2> turns that into %11 = <0, %3>.
117  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
118  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
119  %12 = add <2 x i32> %11, %9
  ; The same rotl-15 / rotl-13 / lshr / xor pattern is applied to %12 (shift
  ; amount 0) and then to %18, %24, %31, %38, %44 and %51 (shift amount 10).
120  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
121  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
122  %15 = xor <2 x i32> %13, %14
123  %16 = lshr <2 x i32> %12, zeroinitializer
124  %17 = xor <2 x i32> %15, %16
125  %18 = add <2 x i32> %4, %17
126  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
127  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
128  %21 = xor <2 x i32> %19, %20
129  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
130  %23 = xor <2 x i32> %21, %22
131  %24 = add <2 x i32> %4, %23
132  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
133  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
134  %27 = xor <2 x i32> %25, %26
135  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
136  %29 = xor <2 x i32> %27, %28
  ; %30 = <lane 1 of %4, lane 0 of %12>.
137  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
138  %31 = add <2 x i32> %30, %29
139  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
140  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
141  %34 = xor <2 x i32> %32, %33
142  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
143  %36 = xor <2 x i32> %34, %35
  ; %37 = <lane 1 of %12, 0>; it is added into both %38 and %44.
144  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
145  %38 = add <2 x i32> %37, %36
  ; First store: %38 to %ctx[33] (two i32 lanes starting at element 33).
146  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
147  store <2 x i32> %38, ptr %arrayidx918, align 4
148  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
149  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
150  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
151  %41 = xor <2 x i32> %39, %40
152  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
153  %43 = xor <2 x i32> %41, %42
154  %44 = add <2 x i32> %37, %43
  ; Second store: an all-zero vector to %ctx[35]; %44 itself is not stored
  ; here and only feeds the computation of %51 below.
155  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
156  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
157  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
158  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
159  %47 = xor <2 x i32> %45, %46
160  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
161  %49 = xor <2 x i32> %47, %48
  ; %50 is %24 shifted right by zero; third store: %51 to %ctx[37].
162  %50 = lshr <2 x i32> %24, zeroinitializer
163  %51 = add <2 x i32> %50, %49
164  store <2 x i32> %51, ptr %arrayidx1106, align 4
165  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
166  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
167  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
168  %54 = xor <2 x i32> %52, %53
169  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
170  %56 = xor <2 x i32> %54, %55
  ; %57 = <poison, lane 0 of %38>; overwriting lane 0 gives
  ; %58 = <0, lane 0 of %38>. Fourth store: %59 to %ctx[39].
171  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
172  %58 = insertelement <2 x i32> %57, i32 0, i64 0
173  %59 = add <2 x i32> %58, %56
174  store <2 x i32> %59, ptr %arrayidx1200, align 4
175  ret void
176
177; uselistorder directives
178  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
179  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
180}
181
; Funnel-shift-left intrinsic on <2 x i32>. Every call in this test passes the
; same value as both data operands, so each call is a rotate left by the
; constant amount (15 or 13).
182declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2
183
184; uselistorder directives
; Use-list order for @llvm.fshl.v2i32; the 16 indices correspond to the 16
; calls of the intrinsic in the function in this test.
185uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }
186
; Attribute groups referenced in this test:
;   #0 - the two llvm.lifetime.* declarations.
;   #1 - @SHA256_Compress_Generic; sets target-cpu "skylake" and a
;        target-features list that includes +avx2 and +movbe (the expected
;        assembly uses movbel and 256-bit integer vector instructions).
;   #2 - the @llvm.fshl.v2i32 declaration.
;   #3 - the two "bswap" inline-asm call sites.
187attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
188attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
189attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
190attributes #3 = { nounwind memory(none) }
191