; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512

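; This file checks that the SLP vectorizer turns scalar rotate-right patterns,
; written as funnel shifts with equal data operands, into vector @llvm.fshr
; calls, with the vector factor depending on the target CPU (see the per-prefix
; bodies below). Every source array holds 512 bits of data in total
; (8 x i64, 16 x i32, 32 x i16, 64 x i8).
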
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@c64 = common global [8 x i64] zeroinitializer, align 64
@d64 = common global [8 x i64] zeroinitializer, align 64
@a32 = common global [16 x i32] zeroinitializer, align 64
@b32 = common global [16 x i32] zeroinitializer, align 64
@c32 = common global [16 x i32] zeroinitializer, align 64
@d32 = common global [16 x i32] zeroinitializer, align 64
@a16 = common global [32 x i16] zeroinitializer, align 64
@b16 = common global [32 x i16] zeroinitializer, align 64
@c16 = common global [32 x i16] zeroinitializer, align 64
@d16 = common global [32 x i16] zeroinitializer, align 64
@a8  = common global [64 x i8] zeroinitializer, align 64
@b8  = common global [64 x i8] zeroinitializer, align 64
@c8  = common global [64 x i8] zeroinitializer, align 64
@d8  = common global [64 x i8] zeroinitializer, align 64

declare i64 @llvm.fshr.i64(i64, i64, i64)
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare i8  @llvm.fshr.i8 (i8 , i8 , i8 )

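; fshr concatenates its first two operands and shifts the pair right, so with
; both data operands equal it reduces to a rotate right: fshr(x, x, c) == rotr(x, c).
; Illustrative example (not one of the test's inputs):
;   fshr i8 0x12, i8 0x12, i8 4 --> 0x21   (0001'0010 rotated right by 4 bits)
;
; 8 x i64: SSE and AVX1 stay scalar; AVX2 and AVX256 use two <4 x i64> ops;
; AVX512 and AVX512VBMI2 use a single <8 x i64> op.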
define void @fshr_v8i64() {
; SSE-LABEL: @fshr_v8i64(
; SSE-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
; SSE-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
; SSE-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
; SSE-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
; SSE-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; SSE-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
; SSE-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
; SSE-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
; SSE-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
; SSE-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
; SSE-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
; SSE-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
; SSE-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; SSE-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
; SSE-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
; SSE-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
; SSE-NEXT:    [[R0:%.*]] = call i64 @llvm.fshr.i64(i64 [[A0]], i64 [[A0]], i64 [[B0]])
; SSE-NEXT:    [[R1:%.*]] = call i64 @llvm.fshr.i64(i64 [[A1]], i64 [[A1]], i64 [[B1]])
; SSE-NEXT:    [[R2:%.*]] = call i64 @llvm.fshr.i64(i64 [[A2]], i64 [[A2]], i64 [[B2]])
; SSE-NEXT:    [[R3:%.*]] = call i64 @llvm.fshr.i64(i64 [[A3]], i64 [[A3]], i64 [[B3]])
; SSE-NEXT:    [[R4:%.*]] = call i64 @llvm.fshr.i64(i64 [[A4]], i64 [[A4]], i64 [[B4]])
; SSE-NEXT:    [[R5:%.*]] = call i64 @llvm.fshr.i64(i64 [[A5]], i64 [[A5]], i64 [[B5]])
; SSE-NEXT:    [[R6:%.*]] = call i64 @llvm.fshr.i64(i64 [[A6]], i64 [[A6]], i64 [[B6]])
; SSE-NEXT:    [[R7:%.*]] = call i64 @llvm.fshr.i64(i64 [[A7]], i64 [[A7]], i64 [[B7]])
; SSE-NEXT:    store i64 [[R0]], ptr @d64, align 8
; SSE-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 1), align 8
; SSE-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 2), align 8
; SSE-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 3), align 8
; SSE-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
; SSE-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 5), align 8
; SSE-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 6), align 8
; SSE-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 7), align 8
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @fshr_v8i64(
; AVX1-NEXT:    [[A0:%.*]] = load i64, ptr @a64, align 8
; AVX1-NEXT:    [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
; AVX1-NEXT:    [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
; AVX1-NEXT:    [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
; AVX1-NEXT:    [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; AVX1-NEXT:    [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
; AVX1-NEXT:    [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
; AVX1-NEXT:    [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
; AVX1-NEXT:    [[B0:%.*]] = load i64, ptr @b64, align 8
; AVX1-NEXT:    [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
; AVX1-NEXT:    [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
; AVX1-NEXT:    [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
; AVX1-NEXT:    [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; AVX1-NEXT:    [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
; AVX1-NEXT:    [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
; AVX1-NEXT:    [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
; AVX1-NEXT:    [[R0:%.*]] = call i64 @llvm.fshr.i64(i64 [[A0]], i64 [[A0]], i64 [[B0]])
; AVX1-NEXT:    [[R1:%.*]] = call i64 @llvm.fshr.i64(i64 [[A1]], i64 [[A1]], i64 [[B1]])
; AVX1-NEXT:    [[R2:%.*]] = call i64 @llvm.fshr.i64(i64 [[A2]], i64 [[A2]], i64 [[B2]])
; AVX1-NEXT:    [[R3:%.*]] = call i64 @llvm.fshr.i64(i64 [[A3]], i64 [[A3]], i64 [[B3]])
; AVX1-NEXT:    [[R4:%.*]] = call i64 @llvm.fshr.i64(i64 [[A4]], i64 [[A4]], i64 [[B4]])
; AVX1-NEXT:    [[R5:%.*]] = call i64 @llvm.fshr.i64(i64 [[A5]], i64 [[A5]], i64 [[B5]])
; AVX1-NEXT:    [[R6:%.*]] = call i64 @llvm.fshr.i64(i64 [[A6]], i64 [[A6]], i64 [[B6]])
; AVX1-NEXT:    [[R7:%.*]] = call i64 @llvm.fshr.i64(i64 [[A7]], i64 [[A7]], i64 [[B7]])
; AVX1-NEXT:    store i64 [[R0]], ptr @d64, align 8
; AVX1-NEXT:    store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 1), align 8
; AVX1-NEXT:    store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 2), align 8
; AVX1-NEXT:    store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 3), align 8
; AVX1-NEXT:    store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
; AVX1-NEXT:    store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 5), align 8
; AVX1-NEXT:    store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 6), align 8
; AVX1-NEXT:    store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 7), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @fshr_v8i64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
; AVX2-NEXT:    store <4 x i64> [[TMP3]], ptr @d64, align 8
; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
; AVX2-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
; AVX2-NEXT:    ret void
;
; AVX256-LABEL: @fshr_v8i64(
; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
; AVX256-NEXT:    store <4 x i64> [[TMP3]], ptr @d64, align 8
; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
; AVX256-NEXT:    store <4 x i64> [[TMP6]], ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @fshr_v8i64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT:    store <8 x i64> [[TMP3]], ptr @d64, align 8
; AVX512-NEXT:    ret void
;
; AVX512VBMI2-LABEL: @fshr_v8i64(
; AVX512VBMI2-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
; AVX512VBMI2-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
; AVX512VBMI2-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512VBMI2-NEXT:    store <8 x i64> [[TMP3]], ptr @d64, align 8
; AVX512VBMI2-NEXT:    ret void
;
  %a0 = load i64, ptr @a64, align 8
  %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
  %a2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
  %a3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
  %a4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
  %a5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
  %a6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
  %a7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
  %b0 = load i64, ptr @b64, align 8
  %b1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
  %b2 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
  %b3 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
  %b4 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
  %b5 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
  %b6 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
  %b7 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
  %r0 = call i64 @llvm.fshr.i64(i64 %a0, i64 %a0, i64 %b0)
  %r1 = call i64 @llvm.fshr.i64(i64 %a1, i64 %a1, i64 %b1)
  %r2 = call i64 @llvm.fshr.i64(i64 %a2, i64 %a2, i64 %b2)
  %r3 = call i64 @llvm.fshr.i64(i64 %a3, i64 %a3, i64 %b3)
  %r4 = call i64 @llvm.fshr.i64(i64 %a4, i64 %a4, i64 %b4)
  %r5 = call i64 @llvm.fshr.i64(i64 %a5, i64 %a5, i64 %b5)
  %r6 = call i64 @llvm.fshr.i64(i64 %a6, i64 %a6, i64 %b6)
  %r7 = call i64 @llvm.fshr.i64(i64 %a7, i64 %a7, i64 %b7)
  store i64 %r0, ptr @d64, align 8
  store i64 %r1, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 1), align 8
  store i64 %r2, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 2), align 8
  store i64 %r3, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 3), align 8
  store i64 %r4, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 4), align 8
  store i64 %r5, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 5), align 8
  store i64 %r6, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 6), align 8
  store i64 %r7, ptr getelementptr inbounds ([8 x i64], ptr @d64, i32 0, i64 7), align 8
  ret void
}

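; 16 x i32: SSE stays scalar; AVX1 uses four <4 x i32> ops; AVX2 and AVX256 use
; two <8 x i32> ops; AVX512 and AVX512VBMI2 use a single <16 x i32> op.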
define void @fshr_v16i32() {
; SSE-LABEL: @fshr_v16i32(
; SSE-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
; SSE-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
; SSE-NEXT:    [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
; SSE-NEXT:    [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
; SSE-NEXT:    [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
; SSE-NEXT:    [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
; SSE-NEXT:    [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
; SSE-NEXT:    [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
; SSE-NEXT:    [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; SSE-NEXT:    [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
; SSE-NEXT:    [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
; SSE-NEXT:    [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
; SSE-NEXT:    [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
; SSE-NEXT:    [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
; SSE-NEXT:    [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
; SSE-NEXT:    [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
; SSE-NEXT:    [[B0:%.*]] = load i32, ptr @b32, align 4
; SSE-NEXT:    [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
; SSE-NEXT:    [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
; SSE-NEXT:    [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
; SSE-NEXT:    [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
; SSE-NEXT:    [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
; SSE-NEXT:    [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
; SSE-NEXT:    [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
; SSE-NEXT:    [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; SSE-NEXT:    [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
; SSE-NEXT:    [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
; SSE-NEXT:    [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
; SSE-NEXT:    [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
; SSE-NEXT:    [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
; SSE-NEXT:    [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
; SSE-NEXT:    [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
; SSE-NEXT:    [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]])
; SSE-NEXT:    [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]])
; SSE-NEXT:    [[R2:%.*]] = call i32 @llvm.fshr.i32(i32 [[A2]], i32 [[A2]], i32 [[B2]])
; SSE-NEXT:    [[R3:%.*]] = call i32 @llvm.fshr.i32(i32 [[A3]], i32 [[A3]], i32 [[B3]])
; SSE-NEXT:    [[R4:%.*]] = call i32 @llvm.fshr.i32(i32 [[A4]], i32 [[A4]], i32 [[B4]])
; SSE-NEXT:    [[R5:%.*]] = call i32 @llvm.fshr.i32(i32 [[A5]], i32 [[A5]], i32 [[B5]])
; SSE-NEXT:    [[R6:%.*]] = call i32 @llvm.fshr.i32(i32 [[A6]], i32 [[A6]], i32 [[B6]])
; SSE-NEXT:    [[R7:%.*]] = call i32 @llvm.fshr.i32(i32 [[A7]], i32 [[A7]], i32 [[B7]])
; SSE-NEXT:    [[R8:%.*]] = call i32 @llvm.fshr.i32(i32 [[A8]], i32 [[A8]], i32 [[B8]])
; SSE-NEXT:    [[R9:%.*]] = call i32 @llvm.fshr.i32(i32 [[A9]], i32 [[A9]], i32 [[B9]])
; SSE-NEXT:    [[R10:%.*]] = call i32 @llvm.fshr.i32(i32 [[A10]], i32 [[A10]], i32 [[B10]])
; SSE-NEXT:    [[R11:%.*]] = call i32 @llvm.fshr.i32(i32 [[A11]], i32 [[A11]], i32 [[B11]])
; SSE-NEXT:    [[R12:%.*]] = call i32 @llvm.fshr.i32(i32 [[A12]], i32 [[A12]], i32 [[B12]])
; SSE-NEXT:    [[R13:%.*]] = call i32 @llvm.fshr.i32(i32 [[A13]], i32 [[A13]], i32 [[B13]])
; SSE-NEXT:    [[R14:%.*]] = call i32 @llvm.fshr.i32(i32 [[A14]], i32 [[A14]], i32 [[B14]])
; SSE-NEXT:    [[R15:%.*]] = call i32 @llvm.fshr.i32(i32 [[A15]], i32 [[A15]], i32 [[B15]])
; SSE-NEXT:    store i32 [[R0]], ptr @d32, align 4
; SSE-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
; SSE-NEXT:    store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 2), align 4
; SSE-NEXT:    store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 3), align 4
; SSE-NEXT:    store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4
; SSE-NEXT:    store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 5), align 4
; SSE-NEXT:    store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 6), align 4
; SSE-NEXT:    store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 7), align 4
; SSE-NEXT:    store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
; SSE-NEXT:    store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 9), align 4
; SSE-NEXT:    store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 10), align 4
; SSE-NEXT:    store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 11), align 4
; SSE-NEXT:    store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4
; SSE-NEXT:    store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 13), align 4
; SSE-NEXT:    store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 14), align 4
; SSE-NEXT:    store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @fshr_v16i32(
; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
; AVX1-NEXT:    store <4 x i32> [[TMP3]], ptr @d32, align 4
; AVX1-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
; AVX1-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
; AVX1-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
; AVX1-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4
; AVX1-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; AVX1-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; AVX1-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
; AVX1-NEXT:    store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
; AVX1-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
; AVX1-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
; AVX1-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
; AVX1-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @fshr_v16i32(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
; AVX2-NEXT:    store <8 x i32> [[TMP3]], ptr @d32, align 4
; AVX2-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; AVX2-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; AVX2-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
; AVX2-NEXT:    store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
; AVX2-NEXT:    ret void
;
; AVX256-LABEL: @fshr_v16i32(
; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
; AVX256-NEXT:    store <8 x i32> [[TMP3]], ptr @d32, align 4
; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
; AVX256-NEXT:    store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4
; AVX256-NEXT:    ret void
;
; AVX512-LABEL: @fshr_v16i32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
; AVX512-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT:    store <16 x i32> [[TMP3]], ptr @d32, align 4
; AVX512-NEXT:    ret void
;
; AVX512VBMI2-LABEL: @fshr_v16i32(
; AVX512VBMI2-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
; AVX512VBMI2-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
; AVX512VBMI2-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512VBMI2-NEXT:    store <16 x i32> [[TMP3]], ptr @d32, align 4
; AVX512VBMI2-NEXT:    ret void
;
  %a0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
  %a1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
  %a2  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2 ), align 4
  %a3  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3 ), align 4
  %a4  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4 ), align 4
  %a5  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5 ), align 4
  %a6  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6 ), align 4
  %a7  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7 ), align 4
  %a8  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8 ), align 4
  %a9  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9 ), align 4
  %a10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
  %a11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
  %a12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
  %a13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
  %a14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
  %a15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
  %b0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 0 ), align 4
  %b1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1 ), align 4
  %b2  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2 ), align 4
  %b3  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3 ), align 4
  %b4  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4 ), align 4
  %b5  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5 ), align 4
  %b6  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6 ), align 4
  %b7  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7 ), align 4
  %b8  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8 ), align 4
  %b9  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9 ), align 4
  %b10 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
  %b11 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
  %b12 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
  %b13 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
  %b14 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
  %b15 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
  %r0  = call i32 @llvm.fshr.i32(i32 %a0 , i32 %a0 , i32 %b0 )
  %r1  = call i32 @llvm.fshr.i32(i32 %a1 , i32 %a1 , i32 %b1 )
  %r2  = call i32 @llvm.fshr.i32(i32 %a2 , i32 %a2 , i32 %b2 )
  %r3  = call i32 @llvm.fshr.i32(i32 %a3 , i32 %a3 , i32 %b3 )
  %r4  = call i32 @llvm.fshr.i32(i32 %a4 , i32 %a4 , i32 %b4 )
  %r5  = call i32 @llvm.fshr.i32(i32 %a5 , i32 %a5 , i32 %b5 )
  %r6  = call i32 @llvm.fshr.i32(i32 %a6 , i32 %a6 , i32 %b6 )
  %r7  = call i32 @llvm.fshr.i32(i32 %a7 , i32 %a7 , i32 %b7 )
  %r8  = call i32 @llvm.fshr.i32(i32 %a8 , i32 %a8 , i32 %b8 )
  %r9  = call i32 @llvm.fshr.i32(i32 %a9 , i32 %a9 , i32 %b9 )
  %r10 = call i32 @llvm.fshr.i32(i32 %a10, i32 %a10, i32 %b10)
  %r11 = call i32 @llvm.fshr.i32(i32 %a11, i32 %a11, i32 %b11)
  %r12 = call i32 @llvm.fshr.i32(i32 %a12, i32 %a12, i32 %b12)
  %r13 = call i32 @llvm.fshr.i32(i32 %a13, i32 %a13, i32 %b13)
  %r14 = call i32 @llvm.fshr.i32(i32 %a14, i32 %a14, i32 %b14)
  %r15 = call i32 @llvm.fshr.i32(i32 %a15, i32 %a15, i32 %b15)
  store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 0 ), align 4
  store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1 ), align 4
  store i32 %r2 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 2 ), align 4
  store i32 %r3 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 3 ), align 4
  store i32 %r4 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4 ), align 4
  store i32 %r5 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 5 ), align 4
  store i32 %r6 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 6 ), align 4
  store i32 %r7 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 7 ), align 4
  store i32 %r8 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8 ), align 4
  store i32 %r9 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 9 ), align 4
  store i32 %r10, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 10), align 4
  store i32 %r11, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 11), align 4
  store i32 %r12, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4
  store i32 %r13, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 13), align 4
  store i32 %r14, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 14), align 4
  store i32 %r15, ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4
  ret void
}

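; 32 x i16: SSE stays scalar; the shared AVX prefix covers two <16 x i16> ops;
; AVX512 and AVX512VBMI2 use a single <32 x i16> op.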
365define void @fshr_v32i16() {
366; SSE-LABEL: @fshr_v32i16(
367; SSE-NEXT:    [[A0:%.*]] = load i16, ptr @a16, align 2
368; SSE-NEXT:    [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
369; SSE-NEXT:    [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
370; SSE-NEXT:    [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
371; SSE-NEXT:    [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
372; SSE-NEXT:    [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
373; SSE-NEXT:    [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
374; SSE-NEXT:    [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
375; SSE-NEXT:    [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
376; SSE-NEXT:    [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
377; SSE-NEXT:    [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
378; SSE-NEXT:    [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
379; SSE-NEXT:    [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
380; SSE-NEXT:    [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
381; SSE-NEXT:    [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
382; SSE-NEXT:    [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
383; SSE-NEXT:    [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
384; SSE-NEXT:    [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
385; SSE-NEXT:    [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
386; SSE-NEXT:    [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
387; SSE-NEXT:    [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
388; SSE-NEXT:    [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
389; SSE-NEXT:    [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
390; SSE-NEXT:    [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
391; SSE-NEXT:    [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
392; SSE-NEXT:    [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
393; SSE-NEXT:    [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
394; SSE-NEXT:    [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
395; SSE-NEXT:    [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
396; SSE-NEXT:    [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
397; SSE-NEXT:    [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
398; SSE-NEXT:    [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
399; SSE-NEXT:    [[B0:%.*]] = load i16, ptr @b16, align 2
400; SSE-NEXT:    [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
401; SSE-NEXT:    [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
402; SSE-NEXT:    [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
403; SSE-NEXT:    [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
404; SSE-NEXT:    [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
405; SSE-NEXT:    [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
406; SSE-NEXT:    [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
407; SSE-NEXT:    [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
408; SSE-NEXT:    [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
409; SSE-NEXT:    [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
410; SSE-NEXT:    [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
411; SSE-NEXT:    [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
412; SSE-NEXT:    [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
413; SSE-NEXT:    [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
414; SSE-NEXT:    [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
415; SSE-NEXT:    [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
416; SSE-NEXT:    [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
417; SSE-NEXT:    [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
418; SSE-NEXT:    [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
419; SSE-NEXT:    [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
420; SSE-NEXT:    [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
421; SSE-NEXT:    [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
422; SSE-NEXT:    [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
423; SSE-NEXT:    [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
424; SSE-NEXT:    [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
425; SSE-NEXT:    [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
426; SSE-NEXT:    [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
427; SSE-NEXT:    [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
428; SSE-NEXT:    [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
429; SSE-NEXT:    [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
430; SSE-NEXT:    [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
431; SSE-NEXT:    [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]])
432; SSE-NEXT:    [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]])
433; SSE-NEXT:    [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]])
434; SSE-NEXT:    [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]])
435; SSE-NEXT:    [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]])
436; SSE-NEXT:    [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]])
437; SSE-NEXT:    [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]])
438; SSE-NEXT:    [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]])
439; SSE-NEXT:    [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]])
440; SSE-NEXT:    [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]])
441; SSE-NEXT:    [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]])
442; SSE-NEXT:    [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]])
443; SSE-NEXT:    [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]])
444; SSE-NEXT:    [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]])
445; SSE-NEXT:    [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]])
446; SSE-NEXT:    [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]])
447; SSE-NEXT:    [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]])
448; SSE-NEXT:    [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]])
449; SSE-NEXT:    [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]])
450; SSE-NEXT:    [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]])
451; SSE-NEXT:    [[R20:%.*]] = call i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]])
452; SSE-NEXT:    [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]])
453; SSE-NEXT:    [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]])
454; SSE-NEXT:    [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]])
455; SSE-NEXT:    [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]])
456; SSE-NEXT:    [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]])
457; SSE-NEXT:    [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]])
458; SSE-NEXT:    [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]])
459; SSE-NEXT:    [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]])
460; SSE-NEXT:    [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]])
461; SSE-NEXT:    [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]])
462; SSE-NEXT:    [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]])
463; SSE-NEXT:    store i16 [[R0]], ptr @d16, align 2
464; SSE-NEXT:    store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2
465; SSE-NEXT:    store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2
466; SSE-NEXT:    store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2
467; SSE-NEXT:    store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2
468; SSE-NEXT:    store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2
469; SSE-NEXT:    store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2
470; SSE-NEXT:    store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2
471; SSE-NEXT:    store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
472; SSE-NEXT:    store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2
473; SSE-NEXT:    store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
474; SSE-NEXT:    store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
475; SSE-NEXT:    store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
476; SSE-NEXT:    store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
477; SSE-NEXT:    store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
478; SSE-NEXT:    store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
479; SSE-NEXT:    store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
480; SSE-NEXT:    store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
481; SSE-NEXT:    store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
482; SSE-NEXT:    store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
483; SSE-NEXT:    store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
484; SSE-NEXT:    store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
485; SSE-NEXT:    store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
486; SSE-NEXT:    store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
487; SSE-NEXT:    store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
488; SSE-NEXT:    store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
489; SSE-NEXT:    store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
490; SSE-NEXT:    store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
491; SSE-NEXT:    store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
492; SSE-NEXT:    store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
493; SSE-NEXT:    store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
494; SSE-NEXT:    store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
495; SSE-NEXT:    ret void
496;
497; AVX-LABEL: @fshr_v32i16(
498; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
499; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
500; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
501; AVX-NEXT:    store <16 x i16> [[TMP3]], ptr @d16, align 2
502; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
503; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
504; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
505; AVX-NEXT:    store <16 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
506; AVX-NEXT:    ret void
507;
508; AVX512-LABEL: @fshr_v32i16(
509; AVX512-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
510; AVX512-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
511; AVX512-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
512; AVX512-NEXT:    store <32 x i16> [[TMP3]], ptr @d16, align 2
513; AVX512-NEXT:    ret void
514;
515; AVX512VBMI2-LABEL: @fshr_v32i16(
516; AVX512VBMI2-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
517; AVX512VBMI2-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
518; AVX512VBMI2-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
519; AVX512VBMI2-NEXT:    store <32 x i16> [[TMP3]], ptr @d16, align 2
520; AVX512VBMI2-NEXT:    ret void
521;
522  %a0  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
523  %a1  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
524  %a2  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2 ), align 2
525  %a3  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3 ), align 2
526  %a4  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4 ), align 2
527  %a5  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5 ), align 2
528  %a6  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6 ), align 2
529  %a7  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7 ), align 2
530  %a8  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8 ), align 2
531  %a9  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9 ), align 2
532  %a10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
533  %a11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
534  %a12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
535  %a13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
536  %a14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
537  %a15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
538  %a16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
539  %a17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
540  %a18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
541  %a19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
542  %a20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
543  %a21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
544  %a22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
545  %a23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
546  %a24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
547  %a25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
548  %a26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
549  %a27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
550  %a28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
551  %a29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
552  %a30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
553  %a31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
554  %b0  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 0 ), align 2
555  %b1  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1 ), align 2
556  %b2  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2 ), align 2
557  %b3  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3 ), align 2
558  %b4  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4 ), align 2
559  %b5  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5 ), align 2
560  %b6  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6 ), align 2
561  %b7  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7 ), align 2
562  %b8  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8 ), align 2
563  %b9  = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9 ), align 2
564  %b10 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
565  %b11 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
566  %b12 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
567  %b13 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
568  %b14 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
569  %b15 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
570  %b16 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
571  %b17 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
572  %b18 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
573  %b19 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
574  %b20 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
575  %b21 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
576  %b22 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
577  %b23 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
578  %b24 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
579  %b25 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
580  %b26 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
581  %b27 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
582  %b28 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
583  %b29 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
584  %b30 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
585  %b31 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
586  %r0  = call i16 @llvm.fshr.i16(i16 %a0 , i16 %a0 , i16 %b0 )
587  %r1  = call i16 @llvm.fshr.i16(i16 %a1 , i16 %a1 , i16 %b1 )
588  %r2  = call i16 @llvm.fshr.i16(i16 %a2 , i16 %a2 , i16 %b2 )
589  %r3  = call i16 @llvm.fshr.i16(i16 %a3 , i16 %a3 , i16 %b3 )
590  %r4  = call i16 @llvm.fshr.i16(i16 %a4 , i16 %a4 , i16 %b4 )
591  %r5  = call i16 @llvm.fshr.i16(i16 %a5 , i16 %a5 , i16 %b5 )
592  %r6  = call i16 @llvm.fshr.i16(i16 %a6 , i16 %a6 , i16 %b6 )
593  %r7  = call i16 @llvm.fshr.i16(i16 %a7 , i16 %a7 , i16 %b7 )
  %r8  = call i16 @llvm.fshr.i16(i16 %a8 , i16 %a8 , i16 %b8 )
  %r9  = call i16 @llvm.fshr.i16(i16 %a9 , i16 %a9 , i16 %b9 )
  %r10 = call i16 @llvm.fshr.i16(i16 %a10, i16 %a10, i16 %b10)
  %r11 = call i16 @llvm.fshr.i16(i16 %a11, i16 %a11, i16 %b11)
  %r12 = call i16 @llvm.fshr.i16(i16 %a12, i16 %a12, i16 %b12)
  %r13 = call i16 @llvm.fshr.i16(i16 %a13, i16 %a13, i16 %b13)
  %r14 = call i16 @llvm.fshr.i16(i16 %a14, i16 %a14, i16 %b14)
  %r15 = call i16 @llvm.fshr.i16(i16 %a15, i16 %a15, i16 %b15)
  %r16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %b16)
  %r17 = call i16 @llvm.fshr.i16(i16 %a17, i16 %a17, i16 %b17)
  %r18 = call i16 @llvm.fshr.i16(i16 %a18, i16 %a18, i16 %b18)
  %r19 = call i16 @llvm.fshr.i16(i16 %a19, i16 %a19, i16 %b19)
  %r20 = call i16 @llvm.fshr.i16(i16 %a20, i16 %a20, i16 %b20)
  %r21 = call i16 @llvm.fshr.i16(i16 %a21, i16 %a21, i16 %b21)
  %r22 = call i16 @llvm.fshr.i16(i16 %a22, i16 %a22, i16 %b22)
  %r23 = call i16 @llvm.fshr.i16(i16 %a23, i16 %a23, i16 %b23)
  %r24 = call i16 @llvm.fshr.i16(i16 %a24, i16 %a24, i16 %b24)
  %r25 = call i16 @llvm.fshr.i16(i16 %a25, i16 %a25, i16 %b25)
  %r26 = call i16 @llvm.fshr.i16(i16 %a26, i16 %a26, i16 %b26)
  %r27 = call i16 @llvm.fshr.i16(i16 %a27, i16 %a27, i16 %b27)
  %r28 = call i16 @llvm.fshr.i16(i16 %a28, i16 %a28, i16 %b28)
  %r29 = call i16 @llvm.fshr.i16(i16 %a29, i16 %a29, i16 %b29)
  %r30 = call i16 @llvm.fshr.i16(i16 %a30, i16 %a30, i16 %b30)
  %r31 = call i16 @llvm.fshr.i16(i16 %a31, i16 %a31, i16 %b31)
  store i16 %r0 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 0 ), align 2
  store i16 %r1 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1 ), align 2
  store i16 %r2 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2 ), align 2
  store i16 %r3 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3 ), align 2
  store i16 %r4 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4 ), align 2
  store i16 %r5 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5 ), align 2
  store i16 %r6 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6 ), align 2
  store i16 %r7 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7 ), align 2
  store i16 %r8 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8 ), align 2
  store i16 %r9 , ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9 ), align 2
  store i16 %r10, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
  store i16 %r11, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
  store i16 %r12, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
  store i16 %r13, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
  store i16 %r14, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
  store i16 %r15, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
  store i16 %r16, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
  store i16 %r17, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
  store i16 %r18, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
  store i16 %r19, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
  store i16 %r20, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
  store i16 %r21, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
  store i16 %r22, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
  store i16 %r23, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
  store i16 %r24, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
  store i16 %r25, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
  store i16 %r26, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
  store i16 %r27, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
  store i16 %r28, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
  store i16 %r29, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
  store i16 %r30, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
  store i16 %r31, ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
  ret void
}

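; Note: an fshr whose first two operands match is a rotate-right, i.e.
; fshr(x, x, c) == rotr(x, c mod bitwidth). As a quick worked example of the
; semantics (illustrative only, not checked by any RUN line): for i8,
; fshr(0x01, 0x01, 1) shifts the 16-bit concatenation 0x0101 right by one and
; truncates to the low byte, giving 0x80 == rotr(0x01, 1). The check blocks in
; @fshr_v64i8 below show how each target tiles the 64-byte rotate: SSE into
; four <16 x i8> ops, AVX into two <32 x i8> ops, and AVX512/AVX512VBMI2 into
; a single <64 x i8> op.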
define void @fshr_v64i8() {
; SSE-LABEL: @fshr_v64i8(
; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
; SSE-NEXT:    store <16 x i8> [[TMP3]], ptr @d8, align 1
; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
; SSE-NEXT:    store <16 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @fshr_v64i8(
; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
; AVX-NEXT:    store <32 x i8> [[TMP3]], ptr @d8, align 1
; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
; AVX-NEXT:    store <32 x i8> [[TMP6]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
; AVX-NEXT:    ret void
;
; AVX512-LABEL: @fshr_v64i8(
; AVX512-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
; AVX512-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
; AVX512-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; AVX512-NEXT:    store <64 x i8> [[TMP3]], ptr @d8, align 1
; AVX512-NEXT:    ret void
;
; AVX512VBMI2-LABEL: @fshr_v64i8(
; AVX512VBMI2-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
; AVX512VBMI2-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
; AVX512VBMI2-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; AVX512VBMI2-NEXT:    store <64 x i8> [[TMP3]], ptr @d8, align 1
; AVX512VBMI2-NEXT:    ret void
;
  %a0  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
  %a1  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
  %a2  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2 ), align 1
  %a3  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3 ), align 1
  %a4  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4 ), align 1
  %a5  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5 ), align 1
  %a6  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6 ), align 1
  %a7  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7 ), align 1
  %a8  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8 ), align 1
  %a9  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9 ), align 1
  %a10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
  %a11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
  %a12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
  %a13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
  %a14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
  %a15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
  %a16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
  %a17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
  %a18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
  %a19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
  %a20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
  %a21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
  %a22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
  %a23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
  %a24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
  %a25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
  %a26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
  %a27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
  %a28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
  %a29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
  %a30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
  %a31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
  %a32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
  %a33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
  %a34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
  %a35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
  %a36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
  %a37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
  %a38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
  %a39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
  %a40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
  %a41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
  %a42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
  %a43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
  %a44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
  %a45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
  %a46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
  %a47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
  %a48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
  %a49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
  %a50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
  %a51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
  %a52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
  %a53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
  %a54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
  %a55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
  %a56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
  %a57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
  %a58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
  %a59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
  %a60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
  %a61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
  %a62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
  %a63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
  %b0  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 0 ), align 1
  %b1  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1 ), align 1
  %b2  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2 ), align 1
  %b3  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3 ), align 1
  %b4  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4 ), align 1
  %b5  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5 ), align 1
  %b6  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6 ), align 1
  %b7  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7 ), align 1
  %b8  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8 ), align 1
  %b9  = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9 ), align 1
  %b10 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
  %b11 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
  %b12 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
  %b13 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
  %b14 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
  %b15 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
  %b16 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
  %b17 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
  %b18 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
  %b19 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
  %b20 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
  %b21 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
  %b22 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
  %b23 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
  %b24 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
  %b25 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
  %b26 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
  %b27 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
  %b28 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
  %b29 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
  %b30 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
  %b31 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
  %b32 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
  %b33 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
  %b34 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
  %b35 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
  %b36 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
  %b37 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
  %b38 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
  %b39 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
  %b40 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
  %b41 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
  %b42 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
  %b43 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
  %b44 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
  %b45 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
  %b46 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
  %b47 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
  %b48 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
  %b49 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
  %b50 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
  %b51 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
  %b52 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
  %b53 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
  %b54 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
  %b55 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
  %b56 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
  %b57 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
  %b58 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
  %b59 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
  %b60 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
  %b61 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
  %b62 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
  %b63 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
  %r0  = call i8 @llvm.fshr.i8(i8 %a0 , i8 %a0 , i8 %b0 )
  %r1  = call i8 @llvm.fshr.i8(i8 %a1 , i8 %a1 , i8 %b1 )
  %r2  = call i8 @llvm.fshr.i8(i8 %a2 , i8 %a2 , i8 %b2 )
  %r3  = call i8 @llvm.fshr.i8(i8 %a3 , i8 %a3 , i8 %b3 )
  %r4  = call i8 @llvm.fshr.i8(i8 %a4 , i8 %a4 , i8 %b4 )
  %r5  = call i8 @llvm.fshr.i8(i8 %a5 , i8 %a5 , i8 %b5 )
  %r6  = call i8 @llvm.fshr.i8(i8 %a6 , i8 %a6 , i8 %b6 )
  %r7  = call i8 @llvm.fshr.i8(i8 %a7 , i8 %a7 , i8 %b7 )
  %r8  = call i8 @llvm.fshr.i8(i8 %a8 , i8 %a8 , i8 %b8 )
  %r9  = call i8 @llvm.fshr.i8(i8 %a9 , i8 %a9 , i8 %b9 )
  %r10 = call i8 @llvm.fshr.i8(i8 %a10, i8 %a10, i8 %b10)
  %r11 = call i8 @llvm.fshr.i8(i8 %a11, i8 %a11, i8 %b11)
  %r12 = call i8 @llvm.fshr.i8(i8 %a12, i8 %a12, i8 %b12)
  %r13 = call i8 @llvm.fshr.i8(i8 %a13, i8 %a13, i8 %b13)
  %r14 = call i8 @llvm.fshr.i8(i8 %a14, i8 %a14, i8 %b14)
  %r15 = call i8 @llvm.fshr.i8(i8 %a15, i8 %a15, i8 %b15)
  %r16 = call i8 @llvm.fshr.i8(i8 %a16, i8 %a16, i8 %b16)
  %r17 = call i8 @llvm.fshr.i8(i8 %a17, i8 %a17, i8 %b17)
  %r18 = call i8 @llvm.fshr.i8(i8 %a18, i8 %a18, i8 %b18)
  %r19 = call i8 @llvm.fshr.i8(i8 %a19, i8 %a19, i8 %b19)
  %r20 = call i8 @llvm.fshr.i8(i8 %a20, i8 %a20, i8 %b20)
  %r21 = call i8 @llvm.fshr.i8(i8 %a21, i8 %a21, i8 %b21)
  %r22 = call i8 @llvm.fshr.i8(i8 %a22, i8 %a22, i8 %b22)
  %r23 = call i8 @llvm.fshr.i8(i8 %a23, i8 %a23, i8 %b23)
  %r24 = call i8 @llvm.fshr.i8(i8 %a24, i8 %a24, i8 %b24)
  %r25 = call i8 @llvm.fshr.i8(i8 %a25, i8 %a25, i8 %b25)
  %r26 = call i8 @llvm.fshr.i8(i8 %a26, i8 %a26, i8 %b26)
  %r27 = call i8 @llvm.fshr.i8(i8 %a27, i8 %a27, i8 %b27)
  %r28 = call i8 @llvm.fshr.i8(i8 %a28, i8 %a28, i8 %b28)
  %r29 = call i8 @llvm.fshr.i8(i8 %a29, i8 %a29, i8 %b29)
  %r30 = call i8 @llvm.fshr.i8(i8 %a30, i8 %a30, i8 %b30)
  %r31 = call i8 @llvm.fshr.i8(i8 %a31, i8 %a31, i8 %b31)
  %r32 = call i8 @llvm.fshr.i8(i8 %a32, i8 %a32, i8 %b32)
  %r33 = call i8 @llvm.fshr.i8(i8 %a33, i8 %a33, i8 %b33)
  %r34 = call i8 @llvm.fshr.i8(i8 %a34, i8 %a34, i8 %b34)
  %r35 = call i8 @llvm.fshr.i8(i8 %a35, i8 %a35, i8 %b35)
  %r36 = call i8 @llvm.fshr.i8(i8 %a36, i8 %a36, i8 %b36)
  %r37 = call i8 @llvm.fshr.i8(i8 %a37, i8 %a37, i8 %b37)
  %r38 = call i8 @llvm.fshr.i8(i8 %a38, i8 %a38, i8 %b38)
  %r39 = call i8 @llvm.fshr.i8(i8 %a39, i8 %a39, i8 %b39)
  %r40 = call i8 @llvm.fshr.i8(i8 %a40, i8 %a40, i8 %b40)
  %r41 = call i8 @llvm.fshr.i8(i8 %a41, i8 %a41, i8 %b41)
  %r42 = call i8 @llvm.fshr.i8(i8 %a42, i8 %a42, i8 %b42)
  %r43 = call i8 @llvm.fshr.i8(i8 %a43, i8 %a43, i8 %b43)
  %r44 = call i8 @llvm.fshr.i8(i8 %a44, i8 %a44, i8 %b44)
  %r45 = call i8 @llvm.fshr.i8(i8 %a45, i8 %a45, i8 %b45)
  %r46 = call i8 @llvm.fshr.i8(i8 %a46, i8 %a46, i8 %b46)
  %r47 = call i8 @llvm.fshr.i8(i8 %a47, i8 %a47, i8 %b47)
  %r48 = call i8 @llvm.fshr.i8(i8 %a48, i8 %a48, i8 %b48)
  %r49 = call i8 @llvm.fshr.i8(i8 %a49, i8 %a49, i8 %b49)
  %r50 = call i8 @llvm.fshr.i8(i8 %a50, i8 %a50, i8 %b50)
  %r51 = call i8 @llvm.fshr.i8(i8 %a51, i8 %a51, i8 %b51)
  %r52 = call i8 @llvm.fshr.i8(i8 %a52, i8 %a52, i8 %b52)
  %r53 = call i8 @llvm.fshr.i8(i8 %a53, i8 %a53, i8 %b53)
  %r54 = call i8 @llvm.fshr.i8(i8 %a54, i8 %a54, i8 %b54)
  %r55 = call i8 @llvm.fshr.i8(i8 %a55, i8 %a55, i8 %b55)
  %r56 = call i8 @llvm.fshr.i8(i8 %a56, i8 %a56, i8 %b56)
  %r57 = call i8 @llvm.fshr.i8(i8 %a57, i8 %a57, i8 %b57)
  %r58 = call i8 @llvm.fshr.i8(i8 %a58, i8 %a58, i8 %b58)
  %r59 = call i8 @llvm.fshr.i8(i8 %a59, i8 %a59, i8 %b59)
  %r60 = call i8 @llvm.fshr.i8(i8 %a60, i8 %a60, i8 %b60)
  %r61 = call i8 @llvm.fshr.i8(i8 %a61, i8 %a61, i8 %b61)
  %r62 = call i8 @llvm.fshr.i8(i8 %a62, i8 %a62, i8 %b62)
  %r63 = call i8 @llvm.fshr.i8(i8 %a63, i8 %a63, i8 %b63)
  store i8 %r0 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 0 ), align 1
  store i8 %r1 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 1 ), align 1
  store i8 %r2 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 2 ), align 1
  store i8 %r3 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 3 ), align 1
  store i8 %r4 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 4 ), align 1
  store i8 %r5 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 5 ), align 1
  store i8 %r6 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 6 ), align 1
  store i8 %r7 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 7 ), align 1
  store i8 %r8 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 8 ), align 1
  store i8 %r9 , ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 9 ), align 1
  store i8 %r10, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 10), align 1
  store i8 %r11, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 11), align 1
  store i8 %r12, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 12), align 1
  store i8 %r13, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 13), align 1
  store i8 %r14, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 14), align 1
  store i8 %r15, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 15), align 1
  store i8 %r16, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
  store i8 %r17, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 17), align 1
  store i8 %r18, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 18), align 1
  store i8 %r19, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 19), align 1
  store i8 %r20, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 20), align 1
  store i8 %r21, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 21), align 1
  store i8 %r22, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 22), align 1
  store i8 %r23, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 23), align 1
  store i8 %r24, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 24), align 1
  store i8 %r25, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 25), align 1
  store i8 %r26, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 26), align 1
  store i8 %r27, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 27), align 1
  store i8 %r28, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 28), align 1
  store i8 %r29, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 29), align 1
  store i8 %r30, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 30), align 1
  store i8 %r31, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 31), align 1
  store i8 %r32, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
  store i8 %r33, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 33), align 1
  store i8 %r34, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 34), align 1
  store i8 %r35, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 35), align 1
  store i8 %r36, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 36), align 1
  store i8 %r37, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 37), align 1
  store i8 %r38, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 38), align 1
  store i8 %r39, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 39), align 1
  store i8 %r40, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 40), align 1
  store i8 %r41, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 41), align 1
  store i8 %r42, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 42), align 1
  store i8 %r43, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 43), align 1
  store i8 %r44, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 44), align 1
  store i8 %r45, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 45), align 1
  store i8 %r46, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 46), align 1
  store i8 %r47, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 47), align 1
  store i8 %r48, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
  store i8 %r49, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 49), align 1
  store i8 %r50, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 50), align 1
  store i8 %r51, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 51), align 1
  store i8 %r52, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 52), align 1
  store i8 %r53, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 53), align 1
  store i8 %r54, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 54), align 1
  store i8 %r55, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 55), align 1
  store i8 %r56, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 56), align 1
  store i8 %r57, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 57), align 1
  store i8 %r58, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 58), align 1
  store i8 %r59, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 59), align 1
  store i8 %r60, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 60), align 1
  store i8 %r61, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 61), align 1
  store i8 %r62, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 62), align 1
  store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 63), align 1
  ret void
}

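; @fshr_v2i32 exercises a sub-128-bit vector: per the check lines, only the
; AVX512VBMI2 run (-mcpu=znver4) vectorizes the pair of rotates into one
; <2 x i32> @llvm.fshr.v2i32 call; every other configuration keeps the two
; scalar @llvm.fshr.i32 calls.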
define void @fshr_v2i32() {
; CHECK-LABEL: @fshr_v2i32(
; CHECK-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
; CHECK-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
; CHECK-NEXT:    [[B0:%.*]] = load i32, ptr @b32, align 4
; CHECK-NEXT:    [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]])
; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]])
; CHECK-NEXT:    store i32 [[R0]], ptr @d32, align 4
; CHECK-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
; CHECK-NEXT:    ret void
;
; AVX512VBMI2-LABEL: @fshr_v2i32(
; AVX512VBMI2-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
; AVX512VBMI2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
; AVX512VBMI2-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; AVX512VBMI2-NEXT:    store <2 x i32> [[TMP3]], ptr @d32, align 4
; AVX512VBMI2-NEXT:    ret void
;
  %a0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
  %a1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
  %b0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 0 ), align 4
  %b1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1 ), align 4
  %r0  = call i32 @llvm.fshr.i32(i32 %a0 , i32 %a0 , i32 %b0 )
  %r1  = call i32 @llvm.fshr.i32(i32 %a1 , i32 %a1 , i32 %b1 )
  store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 0 ), align 4
  store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1 ), align 4
  ret void
}

; PR63980
define void @fshr_v2i32_uniformconst() {
; CHECK-LABEL: @fshr_v2i32_uniformconst(
; CHECK-NEXT:    [[A0:%.*]] = load i32, ptr @a32, align 4
; CHECK-NEXT:    [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
; CHECK-NEXT:    store i32 [[R0]], ptr @d32, align 4
; CHECK-NEXT:    store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
; CHECK-NEXT:    ret void
;
; AVX512VBMI2-LABEL: @fshr_v2i32_uniformconst(
; AVX512VBMI2-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
; AVX512VBMI2-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
; AVX512VBMI2-NEXT:    store <2 x i32> [[TMP2]], ptr @d32, align 4
; AVX512VBMI2-NEXT:    ret void
;
  %a0  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
  %a1  = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
  %r0  = call i32 @llvm.fshr.i32(i32 %a0 , i32 %a0 , i32 1 )
  %r1  = call i32 @llvm.fshr.i32(i32 %a1 , i32 %a1 , i32 1 )
  store i32 %r0 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 0 ), align 4
  store i32 %r1 , ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1 ), align 4
  ret void
}

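; For reference, a minimal sketch of the rewrite exercised by
; @fshr_v2i32_uniformconst (hypothetical values %x0/%x1/%xs, not part of the
; test):
;   %r0 = call i32 @llvm.fshr.i32(i32 %x0, i32 %x0, i32 1)  ; rotr(%x0, 1)
;   %r1 = call i32 @llvm.fshr.i32(i32 %x1, i32 %x1, i32 1)  ; rotr(%x1, 1)
; becomes, on an AVX512VBMI2 target:
;   %v = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %xs, <2 x i32> %xs, <2 x i32> splat (i32 1))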