; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=msan -S | FileCheck %s
;
; Test memory sanitizer instrumentation for Arm vector multiplication
; instructions.
;
; Forked from llvm/test/CodeGen/AArch64/arm64-vmul.ll
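;
; MemorySanitizer handles the widening multiplies below by OR-ing the operand
; shadows and zero-extending the combined shadow to the destination type,
; while the saturating sqdmull intrinsics are checked eagerly and report via
; __msan_warning_noreturn. As a rough illustrative sketch only (an assumption
; about the general shape, not part of the checked output), the shadow of
;   call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
; is computed along the lines of:
;   %s  = or <8 x i8> %shadow_a, %shadow_b
;   %sw = zext <8 x i8> %s to <8 x i16>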

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-android9001"

define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @smull8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1:![0-9]+]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <8 x i8> [[_MSPROP2]] to <8 x i16>
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
; CHECK-NEXT:    store <8 x i16> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  %temp1 = load <8 x i8>, ptr %A
  %temp2 = load <8 x i8>, ptr %B
  %temp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %temp1, <8 x i8> %temp2)
  ret <8 x i16> %temp3
}

define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @smull4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %temp1 = load <4 x i16>, ptr %A
  %temp2 = load <4 x i16>, ptr %B
  %temp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  ret <4 x i32> %temp3
}

define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @smull2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    store <2 x i64> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
;
  %temp1 = load <2 x i32>, ptr %A
  %temp2 = load <2 x i32>, ptr %B
  %temp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
  ret <2 x i64> %temp3
}

declare <8 x i16>  @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @umull8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <8 x i8> [[_MSPROP2]] to <8 x i16>
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
; CHECK-NEXT:    store <8 x i16> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  %temp1 = load <8 x i8>, ptr %A
  %temp2 = load <8 x i8>, ptr %B
  %temp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %temp1, <8 x i8> %temp2)
  ret <8 x i16> %temp3
}

define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @umull4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %temp1 = load <4 x i16>, ptr %A
  %temp2 = load <4 x i16>, ptr %B
  %temp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  ret <4 x i32> %temp3
}

define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @umull2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    store <2 x i64> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
;
  %temp1 = load <2 x i32>, ptr %A
  %temp2 = load <2 x i32>, ptr %B
  %temp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
  ret <2 x i64> %temp3
}

declare <8 x i16>  @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmull4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP13]], 0
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
; CHECK:       [[BB15]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB16]]:
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %temp1 = load <4 x i16>, ptr %A
  %temp2 = load <4 x i16>, ptr %B
  %temp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  ret <4 x i32> %temp3
}

define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @sqdmull2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP13]], 0
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
; CHECK:       [[BB15]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB16]]:
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
;
  %temp1 = load <2 x i32>, ptr %A
  %temp2 = load <2 x i32>, ptr %B
  %temp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
  ret <2 x i64> %temp3
}

define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmull2_4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[LOAD1:%.*]] = load <8 x i16>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[LOAD2:%.*]] = load <8 x i16>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[LOAD1]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i16> [[_MSLD1]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP13]], 0
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i16> [[_MSPROP2]] to i64
; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP14]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
; CHECK:       [[BB15]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB16]]:
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %load1 = load <8 x i16>, ptr %A
  %load2 = load <8 x i16>, ptr %B
  %temp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %temp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %temp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  ret <4 x i32> %temp3
}

define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @sqdmull2_2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[LOAD1:%.*]] = load <4 x i32>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[LOAD2:%.*]] = load <4 x i32>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD1]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[_MSLD1]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[LOAD2]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP13]], 0
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i32> [[_MSPROP2]] to i64
; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP14]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
; CHECK:       [[BB15]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB16]]:
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
;
  %load1 = load <4 x i32>, ptr %A
  %load2 = load <4 x i32>, ptr %B
  %temp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %temp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %temp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
  ret <2 x i64> %temp3
}


declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @pmull8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = zext <8 x i8> [[_MSPROP2]] to <8 x i16>
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
; CHECK-NEXT:    store <8 x i16> [[TMP13]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  %temp1 = load <8 x i8>, ptr %A
  %temp2 = load <8 x i8>, ptr %B
  %temp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %temp1, <8 x i8> %temp2)
  ret <8 x i16> %temp3
}

declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone

define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i16> @sqdmulh_4h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i16> [[TMP3]]
;
  %temp1 = load <4 x i16>, ptr %A
  %temp2 = load <4 x i16>, ptr %B
  %temp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %temp1, <4 x i16> %temp2)
  ret <4 x i16> %temp3
}

define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @sqdmulh_8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
; CHECK-NEXT:    store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  %temp1 = load <8 x i16>, ptr %A
  %temp2 = load <8 x i16>, ptr %B
  %temp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %temp1, <8 x i16> %temp2)
  ret <8 x i16> %temp3
}

define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i32> @sqdmulh_2s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
;
  %temp1 = load <2 x i32>, ptr %A
  %temp2 = load <2 x i32>, ptr %B
  %temp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %temp1, <2 x i32> %temp2)
  ret <2 x i32> %temp3
}

define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmulh_4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %temp1 = load <4 x i32>, ptr %A
  %temp2 = load <4 x i32>, ptr %B
  %temp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %temp1, <4 x i32> %temp2)
  ret <4 x i32> %temp3
}

define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqdmulh_1s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load i32, ptr [[TMP12]], align 4
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i32 [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[TMP1]], i32 [[TMP2]])
; CHECK-NEXT:    store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[TMP3]]
;
  %temp1 = load i32, ptr %A
  %temp2 = load i32, ptr %B
  %temp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %temp1, i32 %temp2)
  ret i32 %temp3
}

declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone

define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i16> @sqrdmulh_4h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i16> [[TMP3]]
;
  %temp1 = load <4 x i16>, ptr %A
  %temp2 = load <4 x i16>, ptr %B
  %temp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %temp1, <4 x i16> %temp2)
  ret <4 x i16> %temp3
}

define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @sqrdmulh_8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
; CHECK-NEXT:    store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  %temp1 = load <8 x i16>, ptr %A
  %temp2 = load <8 x i16>, ptr %B
  %temp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %temp1, <8 x i16> %temp2)
  ret <8 x i16> %temp3
}

define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i32> @sqrdmulh_2s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
;
  %temp1 = load <2 x i32>, ptr %A
  %temp2 = load <2 x i32>, ptr %B
  %temp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %temp1, <2 x i32> %temp2)
  ret <2 x i32> %temp3
}

define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqrdmulh_4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %temp1 = load <4 x i32>, ptr %A
  %temp2 = load <4 x i32>, ptr %B
  %temp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %temp1, <4 x i32> %temp2)
  ret <4 x i32> %temp3
}

define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqrdmulh_1s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 4
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load i32, ptr [[TMP12]], align 4
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i32 [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[TMP1]], i32 [[TMP2]])
; CHECK-NEXT:    store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[TMP3]]
;
  %temp1 = load i32, ptr %A
  %temp2 = load i32, ptr %B
  %temp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %temp1, i32 %temp2)
  ret i32 %temp3
}

declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone

define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x float> @fmulx_2s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]])
; CHECK-NEXT:    store <2 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x float> [[TMP3]]
;
  %temp1 = load <2 x float>, ptr %A
  %temp2 = load <2 x float>, ptr %B
  %temp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %temp1, <2 x float> %temp2)
  ret <2 x float> %temp3
}

define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x float> @fmulx_4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x float> [[TMP3]]
;
  %temp1 = load <4 x float>, ptr %A
  %temp2 = load <4 x float>, ptr %B
  %temp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %temp1, <4 x float> %temp2)
  ret <4 x float> %temp3
}

define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x double> @fmulx_2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
; CHECK:       [[BB3]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB4]]:
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
; CHECK:       [[BB8]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB9]]:
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
1023; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
1024; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576
1025; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
1026; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16
1027; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]]
1028; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer
1029; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
1030; CHECK-NEXT:    store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
1031; CHECK-NEXT:    ret <2 x double> [[TMP3]]
1032;
1033  %temp1 = load <2 x double>, ptr %A
1034  %temp2 = load <2 x double>, ptr %B
1035  %temp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %temp1, <2 x double> %temp2)
1036  ret <2 x double> %temp3
1037}
1038
1039declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
1040declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
1041declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
1042
1043define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1044; CHECK-LABEL: define <4 x i32> @smlal4s(
1045; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1046; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1047; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1048; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1049; CHECK-NEXT:    call void @llvm.donothing()
1050; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1051; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1052; CHECK:       [[BB4]]:
1053; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1054; CHECK-NEXT:    unreachable
1055; CHECK:       [[BB5]]:
1056; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
1057; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1058; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1059; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1060; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP8]], align 8
1061; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1062; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1063; CHECK:       [[BB9]]:
1064; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1065; CHECK-NEXT:    unreachable
1066; CHECK:       [[BB10]]:
1067; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
1068; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1069; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1070; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1071; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP13]], align 8
1072; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1073; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1074; CHECK:       [[BB14]]:
1075; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1076; CHECK-NEXT:    unreachable
1077; CHECK:       [[BB15]]:
1078; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1079; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1080; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1081; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1082; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1083; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
1084; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP]], zeroinitializer
1085; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
1086; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1087; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSLD2]], [[TMP19]]
1088; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
1089; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1090; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1091;
1092  %temp1 = load <4 x i16>, ptr %A
1093  %temp2 = load <4 x i16>, ptr %B
1094  %temp3 = load <4 x i32>, ptr %C
1095  %temp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1096  %temp5 = add <4 x i32> %temp3, %temp4
1097  ret <4 x i32> %temp5
1098}
1099
1100define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1101; CHECK-LABEL: define <2 x i64> @smlal2d(
1102; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1103; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1104; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1105; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1106; CHECK-NEXT:    call void @llvm.donothing()
1107; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1108; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1109; CHECK:       [[BB4]]:
1110; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1111; CHECK-NEXT:    unreachable
1112; CHECK:       [[BB5]]:
1113; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
1114; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1115; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1116; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1117; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
1118; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1119; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1120; CHECK:       [[BB9]]:
1121; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1122; CHECK-NEXT:    unreachable
1123; CHECK:       [[BB10]]:
1124; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
1125; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1126; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1127; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1128; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
1129; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1130; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1131; CHECK:       [[BB14]]:
1132; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1133; CHECK-NEXT:    unreachable
1134; CHECK:       [[BB15]]:
1135; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
1136; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1137; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1138; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1139; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
1140; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
1141; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
1142; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
1143; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
1144; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSLD2]], [[TMP19]]
1145; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
1146; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1147; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
1148;
1149  %temp1 = load <2 x i32>, ptr %A
1150  %temp2 = load <2 x i32>, ptr %B
1151  %temp3 = load <2 x i64>, ptr %C
1152  %temp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
1153  %temp5 = add <2 x i64> %temp3, %temp4
1154  ret <2 x i64> %temp5
1155}
1156
1157define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
1158; CHECK-LABEL: define void @smlal8h_chain_with_constant(
1159; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
1160; CHECK-NEXT:    call void @llvm.donothing()
1161; CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
1162; CHECK-NEXT:    [[SMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
1163; CHECK-NEXT:    [[ADD_1:%.*]] = add <8 x i16> [[SMULL_1]], splat (i16 257)
1164; CHECK-NEXT:    [[SMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
1165; CHECK-NEXT:    [[ADD_2:%.*]] = add <8 x i16> [[ADD_1]], [[SMULL_2]]
1166; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
1167; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
1168; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
1169; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
1170; CHECK-NEXT:    store <8 x i16> [[ADD_2]], ptr [[DST]], align 16
1171; CHECK-NEXT:    ret void
1172;
1173  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1174  %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
1175  %add.1 = add <8 x i16> %smull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
1176  %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
1177  %add.2 = add <8 x i16> %add.1, %smull.2
1178  store <8 x i16> %add.2, ptr %dst
1179  ret void
1180}
1181
1182define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
1183; CHECK-LABEL: define void @smlal2d_chain_with_constant(
1184; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
1185; CHECK-NEXT:    call void @llvm.donothing()
1186; CHECK-NEXT:    [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
1187; CHECK-NEXT:    [[SMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
1188; CHECK-NEXT:    [[ADD_1:%.*]] = add <2 x i64> [[SMULL_1]], splat (i64 257)
1189; CHECK-NEXT:    [[SMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
1190; CHECK-NEXT:    [[ADD_2:%.*]] = add <2 x i64> [[ADD_1]], [[SMULL_2]]
1191; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
1192; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
1193; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
1194; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
1195; CHECK-NEXT:    store <2 x i64> [[ADD_2]], ptr [[DST]], align 16
1196; CHECK-NEXT:    ret void
1197;
1198  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
1199  %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
1200  %add.1 = add <2 x i64> %smull.1, <i64 257, i64 257>
1201  %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
1202  %add.2 = add <2 x i64> %add.1, %smull.2
1203  store <2 x i64> %add.2, ptr %dst
1204  ret void
1205}
1206
1207define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1208; CHECK-LABEL: define <4 x i32> @smlsl4s(
1209; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1210; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1211; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1212; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1213; CHECK-NEXT:    call void @llvm.donothing()
1214; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1215; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1216; CHECK:       [[BB4]]:
1217; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1218; CHECK-NEXT:    unreachable
1219; CHECK:       [[BB5]]:
1220; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
1221; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1222; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1223; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1224; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP8]], align 8
1225; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1226; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1227; CHECK:       [[BB9]]:
1228; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1229; CHECK-NEXT:    unreachable
1230; CHECK:       [[BB10]]:
1231; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
1232; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1233; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1234; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1235; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP13]], align 8
1236; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1237; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1238; CHECK:       [[BB14]]:
1239; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1240; CHECK-NEXT:    unreachable
1241; CHECK:       [[BB15]]:
1242; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1243; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1244; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1245; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1246; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1247; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
1248; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP]], zeroinitializer
1249; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
1250; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1251; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSLD2]], [[TMP19]]
1252; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP3]], [[TMP4]]
1253; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1254; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1255;
1256  %temp1 = load <4 x i16>, ptr %A
1257  %temp2 = load <4 x i16>, ptr %B
1258  %temp3 = load <4 x i32>, ptr %C
1259  %temp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1260  %temp5 = sub <4 x i32> %temp3, %temp4
1261  ret <4 x i32> %temp5
1262}
1263
1264define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1265; CHECK-LABEL: define <2 x i64> @smlsl2d(
1266; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1267; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1268; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1269; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1270; CHECK-NEXT:    call void @llvm.donothing()
1271; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1272; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1273; CHECK:       [[BB4]]:
1274; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1275; CHECK-NEXT:    unreachable
1276; CHECK:       [[BB5]]:
1277; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
1278; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1279; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1280; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1281; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
1282; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1283; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1284; CHECK:       [[BB9]]:
1285; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1286; CHECK-NEXT:    unreachable
1287; CHECK:       [[BB10]]:
1288; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
1289; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1290; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1291; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1292; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
1293; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1294; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1295; CHECK:       [[BB14]]:
1296; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1297; CHECK-NEXT:    unreachable
1298; CHECK:       [[BB15]]:
1299; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
1300; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1301; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1302; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1303; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
1304; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
1305; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
1306; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
1307; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
1308; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSLD2]], [[TMP19]]
1309; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP3]], [[TMP4]]
1310; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1311; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
1312;
1313  %temp1 = load <2 x i32>, ptr %A
1314  %temp2 = load <2 x i32>, ptr %B
1315  %temp3 = load <2 x i64>, ptr %C
1316  %temp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
1317  %temp5 = sub <2 x i64> %temp3, %temp4
1318  ret <2 x i64> %temp5
1319}
1320
1321define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
1322; CHECK-LABEL: define void @smlsl8h_chain_with_constant(
1323; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
1324; CHECK-NEXT:    call void @llvm.donothing()
1325; CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
1326; CHECK-NEXT:    [[SMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
1327; CHECK-NEXT:    [[SUB_1:%.*]] = sub <8 x i16> splat (i16 257), [[SMULL_1]]
1328; CHECK-NEXT:    [[SMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
1329; CHECK-NEXT:    [[SUB_2:%.*]] = sub <8 x i16> [[SUB_1]], [[SMULL_2]]
1330; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
1331; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
1332; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
1333; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
1334; CHECK-NEXT:    store <8 x i16> [[SUB_2]], ptr [[DST]], align 16
1335; CHECK-NEXT:    ret void
1336;
1337  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1338  %smull.1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
1339  %sub.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %smull.1
1340  %smull.2 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
1341  %sub.2 = sub <8 x i16> %sub.1, %smull.2
1342  store <8 x i16> %sub.2, ptr %dst
1343  ret void
1344}
1345
1346define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
1347; CHECK-LABEL: define void @smlsl2d_chain_with_constant(
1348; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
1349; CHECK-NEXT:    call void @llvm.donothing()
1350; CHECK-NEXT:    [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
1351; CHECK-NEXT:    [[SMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
1352; CHECK-NEXT:    [[SUB_1:%.*]] = sub <2 x i64> splat (i64 257), [[SMULL_1]]
1353; CHECK-NEXT:    [[SMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
1354; CHECK-NEXT:    [[SUB_2:%.*]] = sub <2 x i64> [[SUB_1]], [[SMULL_2]]
1355; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
1356; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
1357; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
1358; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
1359; CHECK-NEXT:    store <2 x i64> [[SUB_2]], ptr [[DST]], align 16
1360; CHECK-NEXT:    ret void
1361;
1362  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
1363  %smull.1 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
1364  %sub.1 = sub <2 x i64> <i64 257, i64 257>, %smull.1
1365  %smull.2 = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
1366  %sub.2 = sub <2 x i64> %sub.1, %smull.2
1367  store <2 x i64> %sub.2, ptr %dst
1368  ret void
1369}
1370
1371declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
1372declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
1373declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
1374declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
1375
1376define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1377; CHECK-LABEL: define <4 x i32> @sqdmlal4s(
1378; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1379; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1380; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1381; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1382; CHECK-NEXT:    call void @llvm.donothing()
1383; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1384; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1385; CHECK:       [[BB4]]:
1386; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1387; CHECK-NEXT:    unreachable
1388; CHECK:       [[BB5]]:
1389; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
1390; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1391; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1392; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1393; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP8]], align 8
1394; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP10]], 0
1395; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1396; CHECK:       [[BB9]]:
1397; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1398; CHECK-NEXT:    unreachable
1399; CHECK:       [[BB10]]:
1400; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
1401; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1402; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1403; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1404; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP13]], align 8
1405; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
1406; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1407; CHECK:       [[BB14]]:
1408; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1409; CHECK-NEXT:    unreachable
1410; CHECK:       [[BB15]]:
1411; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1412; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1413; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1414; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1415; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1416; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64
1417; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP19]], 0
1418; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64
1419; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP20]], 0
1420; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
1421; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1422; CHECK:       [[BB21]]:
1423; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1424; CHECK-NEXT:    unreachable
1425; CHECK:       [[BB22]]:
1426; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1427; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD2]], zeroinitializer
1428; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
1429; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
1430; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1431;
1432  %temp1 = load <4 x i16>, ptr %A
1433  %temp2 = load <4 x i16>, ptr %B
1434  %temp3 = load <4 x i32>, ptr %C
1435  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1436  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
1437  ret <4 x i32> %temp5
1438}
1439
1440define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1441; CHECK-LABEL: define <2 x i64> @sqdmlal2d(
1442; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1443; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1444; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1445; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1446; CHECK-NEXT:    call void @llvm.donothing()
1447; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1448; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1449; CHECK:       [[BB4]]:
1450; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1451; CHECK-NEXT:    unreachable
1452; CHECK:       [[BB5]]:
1453; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
1454; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1455; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1456; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1457; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
1458; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP10]], 0
1459; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1460; CHECK:       [[BB9]]:
1461; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1462; CHECK-NEXT:    unreachable
1463; CHECK:       [[BB10]]:
1464; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
1465; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1466; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1467; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1468; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
1469; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
1470; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1471; CHECK:       [[BB14]]:
1472; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1473; CHECK-NEXT:    unreachable
1474; CHECK:       [[BB15]]:
1475; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
1476; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1477; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1478; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1479; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
1480; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64
1481; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP19]], 0
1482; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64
1483; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP20]], 0
1484; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
1485; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1486; CHECK:       [[BB21]]:
1487; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1488; CHECK-NEXT:    unreachable
1489; CHECK:       [[BB22]]:
1490; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
1491; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD2]], zeroinitializer
1492; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
1493; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
1494; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
1495;
1496  %temp1 = load <2 x i32>, ptr %A
1497  %temp2 = load <2 x i32>, ptr %B
1498  %temp3 = load <2 x i64>, ptr %C
1499  %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
1500  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
1501  ret <2 x i64> %temp5
1502}
1503
1504define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1505; CHECK-LABEL: define <4 x i32> @sqdmlal2_4s(
1506; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1507; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1508; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1509; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1510; CHECK-NEXT:    call void @llvm.donothing()
1511; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1512; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1513; CHECK:       [[BB4]]:
1514; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1515; CHECK-NEXT:    unreachable
1516; CHECK:       [[BB5]]:
1517; CHECK-NEXT:    [[LOAD1:%.*]] = load <8 x i16>, ptr [[A]], align 16
1518; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1519; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1520; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1521; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP8]], align 16
1522; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1523; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1524; CHECK:       [[BB9]]:
1525; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1526; CHECK-NEXT:    unreachable
1527; CHECK:       [[BB10]]:
1528; CHECK-NEXT:    [[LOAD2:%.*]] = load <8 x i16>, ptr [[B]], align 16
1529; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1530; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1531; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1532; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 16
1533; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1534; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1535; CHECK:       [[BB14]]:
1536; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1537; CHECK-NEXT:    unreachable
1538; CHECK:       [[BB15]]:
1539; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1540; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1541; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1542; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1543; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1544; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1545; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[LOAD1]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1546; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i16> [[_MSLD1]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1547; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1548; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
1549; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i64 [[TMP19]], 0
1550; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x i16> [[_MSPROP3]] to i64
1551; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP20]], 0
1552; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
1553; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1554; CHECK:       [[BB21]]:
1555; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1556; CHECK-NEXT:    unreachable
1557; CHECK:       [[BB22]]:
1558; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1559; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSLD2]], zeroinitializer
1560; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
1561; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1562; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1563;
1564  %load1 = load <8 x i16>, ptr %A
1565  %load2 = load <8 x i16>, ptr %B
1566  %temp3 = load <4 x i32>, ptr %C
1567  %temp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1568  %temp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1569  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1570  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
1571  ret <4 x i32> %temp5
1572}
1573
1574define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1575; CHECK-LABEL: define <2 x i64> @sqdmlal2_2d(
1576; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1577; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1578; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1579; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1580; CHECK-NEXT:    call void @llvm.donothing()
1581; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1582; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1583; CHECK:       [[BB4]]:
1584; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1585; CHECK-NEXT:    unreachable
1586; CHECK:       [[BB5]]:
1587; CHECK-NEXT:    [[LOAD1:%.*]] = load <4 x i32>, ptr [[A]], align 16
1588; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1589; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1590; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1591; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
1592; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1593; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1594; CHECK:       [[BB9]]:
1595; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1596; CHECK-NEXT:    unreachable
1597; CHECK:       [[BB10]]:
1598; CHECK-NEXT:    [[LOAD2:%.*]] = load <4 x i32>, ptr [[B]], align 16
1599; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1600; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1601; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1602; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP13]], align 16
1603; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1604; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1605; CHECK:       [[BB14]]:
1606; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1607; CHECK-NEXT:    unreachable
1608; CHECK:       [[BB15]]:
1609; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
1610; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1611; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1612; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1613; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
1614; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
1615; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD1]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1616; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <4 x i32> [[_MSLD1]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
1617; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[LOAD2]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1618; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
1619; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i64 [[TMP19]], 0
1620; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <2 x i32> [[_MSPROP3]] to i64
1621; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP20]], 0
1622; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
1623; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1624; CHECK:       [[BB21]]:
1625; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1626; CHECK-NEXT:    unreachable
1627; CHECK:       [[BB22]]:
1628; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
1629; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSLD2]], zeroinitializer
1630; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
1631; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1632; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
1633;
1634  %load1 = load <4 x i32>, ptr %A
1635  %load2 = load <4 x i32>, ptr %B
1636  %temp3 = load <2 x i64>, ptr %C
1637  %temp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1638  %temp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1639  %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
1640  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
1641  ret <2 x i64> %temp5
1642}
1643
1644define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1645; CHECK-LABEL: define <4 x i32> @sqdmlsl4s(
1646; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1647; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1648; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1649; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1650; CHECK-NEXT:    call void @llvm.donothing()
1651; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1652; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1653; CHECK:       [[BB4]]:
1654; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1655; CHECK-NEXT:    unreachable
1656; CHECK:       [[BB5]]:
1657; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
1658; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1659; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1660; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1661; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP8]], align 8
1662; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP10]], 0
1663; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1664; CHECK:       [[BB9]]:
1665; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1666; CHECK-NEXT:    unreachable
1667; CHECK:       [[BB10]]:
1668; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
1669; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1670; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1671; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1672; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP13]], align 8
1673; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
1674; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1675; CHECK:       [[BB14]]:
1676; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1677; CHECK-NEXT:    unreachable
1678; CHECK:       [[BB15]]:
1679; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1680; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1681; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1682; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1683; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1684; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[_MSLD]] to i64
1685; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP19]], 0
1686; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64
1687; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP20]], 0
1688; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
1689; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1690; CHECK:       [[BB21]]:
1691; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1692; CHECK-NEXT:    unreachable
1693; CHECK:       [[BB22]]:
1694; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1695; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD2]], zeroinitializer
1696; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
1697; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
1698; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1699;
1700  %temp1 = load <4 x i16>, ptr %A
1701  %temp2 = load <4 x i16>, ptr %B
1702  %temp3 = load <4 x i32>, ptr %C
1703  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1704  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
1705  ret <4 x i32> %temp5
1706}
1707
1708define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1709; CHECK-LABEL: define <2 x i64> @sqdmlsl2d(
1710; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1711; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1712; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1713; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1714; CHECK-NEXT:    call void @llvm.donothing()
1715; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1716; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1717; CHECK:       [[BB4]]:
1718; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1719; CHECK-NEXT:    unreachable
1720; CHECK:       [[BB5]]:
1721; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
1722; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1723; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1724; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1725; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
1726; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP10]], 0
1727; CHECK-NEXT:    br i1 [[_MSCMP3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1728; CHECK:       [[BB9]]:
1729; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1730; CHECK-NEXT:    unreachable
1731; CHECK:       [[BB10]]:
1732; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
1733; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1734; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1735; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1736; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
1737; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP14]], 0
1738; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1739; CHECK:       [[BB14]]:
1740; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1741; CHECK-NEXT:    unreachable
1742; CHECK:       [[BB15]]:
1743; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
1744; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1745; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1746; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1747; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
1748; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i32> [[_MSLD]] to i64
1749; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP19]], 0
1750; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64
1751; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP20]], 0
1752; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
1753; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1754; CHECK:       [[BB21]]:
1755; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1756; CHECK-NEXT:    unreachable
1757; CHECK:       [[BB22]]:
1758; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
1759; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD2]], zeroinitializer
1760; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
1761; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
1762; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
1763;
1764  %temp1 = load <2 x i32>, ptr %A
1765  %temp2 = load <2 x i32>, ptr %B
1766  %temp3 = load <2 x i64>, ptr %C
1767  %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
1768  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
1769  ret <2 x i64> %temp5
1770}
1771
1772define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1773; CHECK-LABEL: define <4 x i32> @sqdmlsl2_4s(
1774; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1775; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1776; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1777; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1778; CHECK-NEXT:    call void @llvm.donothing()
1779; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1780; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1781; CHECK:       [[BB4]]:
1782; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1783; CHECK-NEXT:    unreachable
1784; CHECK:       [[BB5]]:
1785; CHECK-NEXT:    [[LOAD1:%.*]] = load <8 x i16>, ptr [[A]], align 16
1786; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1787; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1788; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1789; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP8]], align 16
1790; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1791; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1792; CHECK:       [[BB9]]:
1793; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1794; CHECK-NEXT:    unreachable
1795; CHECK:       [[BB10]]:
1796; CHECK-NEXT:    [[LOAD2:%.*]] = load <8 x i16>, ptr [[B]], align 16
1797; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1798; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1799; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1800; CHECK-NEXT:    [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 16
1801; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1802; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1803; CHECK:       [[BB14]]:
1804; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1805; CHECK-NEXT:    unreachable
1806; CHECK:       [[BB15]]:
1807; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1808; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1809; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1810; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1811; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1812; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[_MSLD]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1813; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[LOAD1]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1814; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i16> [[_MSLD1]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1815; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1816; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
1817; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i64 [[TMP19]], 0
1818; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <4 x i16> [[_MSPROP3]] to i64
1819; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP20]], 0
1820; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
1821; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1822; CHECK:       [[BB21]]:
1823; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1824; CHECK-NEXT:    unreachable
1825; CHECK:       [[BB22]]:
1826; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1827; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSLD2]], zeroinitializer
1828; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]])
1829; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1830; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1831;
1832  %load1 = load <8 x i16>, ptr %A
1833  %load2 = load <8 x i16>, ptr %B
1834  %temp3 = load <4 x i32>, ptr %C
1835  %temp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1836  %temp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1837  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1838  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %temp3, <4 x i32> %temp4)
1839  ret <4 x i32> %temp5
1840}
1841
1842define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1843; CHECK-LABEL: define <2 x i64> @sqdmlsl2_2d(
1844; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1845; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1846; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1847; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1848; CHECK-NEXT:    call void @llvm.donothing()
1849; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1850; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1851; CHECK:       [[BB4]]:
1852; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1853; CHECK-NEXT:    unreachable
1854; CHECK:       [[BB5]]:
1855; CHECK-NEXT:    [[LOAD1:%.*]] = load <4 x i32>, ptr [[A]], align 16
1856; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1857; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1858; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1859; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
1860; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1861; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1862; CHECK:       [[BB9]]:
1863; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1864; CHECK-NEXT:    unreachable
1865; CHECK:       [[BB10]]:
1866; CHECK-NEXT:    [[LOAD2:%.*]] = load <4 x i32>, ptr [[B]], align 16
1867; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1868; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1869; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1870; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP13]], align 16
1871; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1872; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1873; CHECK:       [[BB14]]:
1874; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1875; CHECK-NEXT:    unreachable
1876; CHECK:       [[BB15]]:
1877; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
1878; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1879; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1880; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1881; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
1882; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
1883; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD1]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1884; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <4 x i32> [[_MSLD1]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
1885; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[LOAD2]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1886; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
1887; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i64 [[TMP19]], 0
1888; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <2 x i32> [[_MSPROP3]] to i64
1889; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i64 [[TMP20]], 0
1890; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
1891; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB21:.*]], label %[[BB22:.*]], !prof [[PROF1]]
1892; CHECK:       [[BB21]]:
1893; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1894; CHECK-NEXT:    unreachable
1895; CHECK:       [[BB22]]:
1896; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
1897; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSLD2]], zeroinitializer
1898; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]])
1899; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1900; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
1901;
1902  %load1 = load <4 x i32>, ptr %A
1903  %load2 = load <4 x i32>, ptr %B
1904  %temp3 = load <2 x i64>, ptr %C
1905  %temp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1906  %temp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1907  %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
1908  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %temp3, <2 x i64> %temp4)
1909  ret <2 x i64> %temp5
1910}
1911
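; Note (editorial): umlal4s/umlal2d check plain shadow propagation for a widening
; multiply followed by add: the result shadow of umull is (shadow(A) | shadow(B))
; zero-extended to the wider element type, and the add then ORs in the accumulator
; shadow loaded from %C.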
1912define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1913; CHECK-LABEL: define <4 x i32> @umlal4s(
1914; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1915; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1916; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1917; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1918; CHECK-NEXT:    call void @llvm.donothing()
1919; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1920; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1921; CHECK:       [[BB4]]:
1922; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1923; CHECK-NEXT:    unreachable
1924; CHECK:       [[BB5]]:
1925; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
1926; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1927; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1928; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1929; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP8]], align 8
1930; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1931; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1932; CHECK:       [[BB9]]:
1933; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1934; CHECK-NEXT:    unreachable
1935; CHECK:       [[BB10]]:
1936; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
1937; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1938; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1939; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1940; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP13]], align 8
1941; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1942; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
1943; CHECK:       [[BB14]]:
1944; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1945; CHECK-NEXT:    unreachable
1946; CHECK:       [[BB15]]:
1947; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
1948; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
1949; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
1950; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
1951; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
1952; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
1953; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP]], zeroinitializer
1954; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
1955; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
1956; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSLD2]], [[TMP19]]
1957; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
1958; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
1959; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
1960;
1961  %temp1 = load <4 x i16>, ptr %A
1962  %temp2 = load <4 x i16>, ptr %B
1963  %temp3 = load <4 x i32>, ptr %C
1964  %temp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
1965  %temp5 = add <4 x i32> %temp3, %temp4
1966  ret <4 x i32> %temp5
1967}
1968
1969define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
1970; CHECK-LABEL: define <2 x i64> @umlal2d(
1971; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
1972; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
1973; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
1974; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
1975; CHECK-NEXT:    call void @llvm.donothing()
1976; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
1977; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
1978; CHECK:       [[BB4]]:
1979; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1980; CHECK-NEXT:    unreachable
1981; CHECK:       [[BB5]]:
1982; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
1983; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
1984; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
1985; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
1986; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
1987; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
1988; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
1989; CHECK:       [[BB9]]:
1990; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
1991; CHECK-NEXT:    unreachable
1992; CHECK:       [[BB10]]:
1993; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
1994; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
1995; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
1996; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
1997; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
1998; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
1999; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2000; CHECK:       [[BB14]]:
2001; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2002; CHECK-NEXT:    unreachable
2003; CHECK:       [[BB15]]:
2004; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
2005; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2006; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2007; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2008; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
2009; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
2010; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
2011; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
2012; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
2013; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSLD2]], [[TMP19]]
2014; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]]
2015; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2016; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
2017;
2018  %temp1 = load <2 x i32>, ptr %A
2019  %temp2 = load <2 x i32>, ptr %B
2020  %temp3 = load <2 x i64>, ptr %C
2021  %temp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
2022  %temp5 = add <2 x i64> %temp3, %temp4
2023  ret <2 x i64> %temp5
2024}
2025
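; Note (editorial): the *_chain_with_constant variants are not marked sanitize_memory,
; so only the store is instrumented: the shadow for %dst is cleared (zeroinitializer)
; and the arithmetic itself carries no shadow computation.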
2026define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
2027; CHECK-LABEL: define void @umlal8h_chain_with_constant(
2028; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
2029; CHECK-NEXT:    call void @llvm.donothing()
2030; CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
2031; CHECK-NEXT:    [[UMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
2032; CHECK-NEXT:    [[ADD_1:%.*]] = add <8 x i16> [[UMULL_1]], splat (i16 257)
2033; CHECK-NEXT:    [[UMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
2034; CHECK-NEXT:    [[ADD_2:%.*]] = add <8 x i16> [[ADD_1]], [[UMULL_2]]
2035; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
2036; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
2037; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
2038; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
2039; CHECK-NEXT:    store <8 x i16> [[ADD_2]], ptr [[DST]], align 16
2040; CHECK-NEXT:    ret void
2041;
2042  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
2043  %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
2044  %add.1 = add <8 x i16> %umull.1, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
2045  %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
2046  %add.2 = add <8 x i16> %add.1, %umull.2
2047  store <8 x i16> %add.2, ptr %dst
2048  ret void
2049}
2050
2051define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
2052; CHECK-LABEL: define void @umlal2d_chain_with_constant(
2053; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
2054; CHECK-NEXT:    call void @llvm.donothing()
2055; CHECK-NEXT:    [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
2056; CHECK-NEXT:    [[UMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
2057; CHECK-NEXT:    [[ADD_1:%.*]] = add <2 x i64> [[UMULL_1]], splat (i64 257)
2058; CHECK-NEXT:    [[UMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
2059; CHECK-NEXT:    [[ADD_2:%.*]] = add <2 x i64> [[ADD_1]], [[UMULL_2]]
2060; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
2061; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
2062; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
2063; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
2064; CHECK-NEXT:    store <2 x i64> [[ADD_2]], ptr [[DST]], align 16
2065; CHECK-NEXT:    ret void
2066;
2067  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
2068  %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
2069  %add.1 = add <2 x i64> %umull.1, <i64 257, i64 257>
2070  %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
2071  %add.2 = add <2 x i64> %add.1, %umull.2
2072  store <2 x i64> %add.2, ptr %dst
2073  ret void
2074}
2075
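; Note (editorial): umlsl4s/umlsl2d mirror the umlal tests above with the accumulate
; expressed as sub instead of add; the shadow computation is identical, since add and
; sub both propagate shadow as the bitwise OR of their operand shadows.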
2076define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2077; CHECK-LABEL: define <4 x i32> @umlsl4s(
2078; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2079; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2080; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2081; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2082; CHECK-NEXT:    call void @llvm.donothing()
2083; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2084; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2085; CHECK:       [[BB4]]:
2086; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2087; CHECK-NEXT:    unreachable
2088; CHECK:       [[BB5]]:
2089; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[A]], align 8
2090; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2091; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2092; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2093; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP8]], align 8
2094; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2095; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2096; CHECK:       [[BB9]]:
2097; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2098; CHECK-NEXT:    unreachable
2099; CHECK:       [[BB10]]:
2100; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[B]], align 8
2101; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2102; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2103; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2104; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP13]], align 8
2105; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2106; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2107; CHECK:       [[BB14]]:
2108; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2109; CHECK-NEXT:    unreachable
2110; CHECK:       [[BB15]]:
2111; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[C]], align 16
2112; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2113; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2114; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2115; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
2116; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]]
2117; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP]], zeroinitializer
2118; CHECK-NEXT:    [[TMP19:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
2119; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
2120; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSLD2]], [[TMP19]]
2121; CHECK-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP3]], [[TMP4]]
2122; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2123; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
2124;
2125  %temp1 = load <4 x i16>, ptr %A
2126  %temp2 = load <4 x i16>, ptr %B
2127  %temp3 = load <4 x i32>, ptr %C
2128  %temp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
2129  %temp5 = sub <4 x i32> %temp3, %temp4
2130  ret <4 x i32> %temp5
2131}
2132
2133define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2134; CHECK-LABEL: define <2 x i64> @umlsl2d(
2135; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2136; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2137; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2138; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2139; CHECK-NEXT:    call void @llvm.donothing()
2140; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2141; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2142; CHECK:       [[BB4]]:
2143; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2144; CHECK-NEXT:    unreachable
2145; CHECK:       [[BB5]]:
2146; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[A]], align 8
2147; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2148; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2149; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2150; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
2151; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2152; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2153; CHECK:       [[BB9]]:
2154; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2155; CHECK-NEXT:    unreachable
2156; CHECK:       [[BB10]]:
2157; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[B]], align 8
2158; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2159; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2160; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2161; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
2162; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2163; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2164; CHECK:       [[BB14]]:
2165; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2166; CHECK-NEXT:    unreachable
2167; CHECK:       [[BB15]]:
2168; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C]], align 16
2169; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2170; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2171; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2172; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
2173; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
2174; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP]], zeroinitializer
2175; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
2176; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
2177; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSLD2]], [[TMP19]]
2178; CHECK-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP3]], [[TMP4]]
2179; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2180; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
2181;
2182  %temp1 = load <2 x i32>, ptr %A
2183  %temp2 = load <2 x i32>, ptr %B
2184  %temp3 = load <2 x i64>, ptr %C
2185  %temp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
2186  %temp5 = sub <2 x i64> %temp3, %temp4
2187  ret <2 x i64> %temp5
2188}
2189
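; Note (editorial): as with the umlal chain tests, these umlsl chains lack
; sanitize_memory, so the only instrumentation is zeroing the shadow of the store
; to %dst.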
2190define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
2191; CHECK-LABEL: define void @umlsl8h_chain_with_constant(
2192; CHECK-SAME: ptr [[DST:%.*]], <8 x i8> [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) {
2193; CHECK-NEXT:    call void @llvm.donothing()
2194; CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i8> [[V3]], splat (i8 -1)
2195; CHECK-NEXT:    [[UMULL_1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V1]], <8 x i8> [[V3]])
2196; CHECK-NEXT:    [[ADD_1:%.*]] = sub <8 x i16> splat (i16 257), [[UMULL_1]]
2197; CHECK-NEXT:    [[UMULL_2:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[V2]], <8 x i8> [[XOR]])
2198; CHECK-NEXT:    [[ADD_2:%.*]] = sub <8 x i16> [[ADD_1]], [[UMULL_2]]
2199; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
2200; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
2201; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
2202; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[TMP3]], align 16
2203; CHECK-NEXT:    store <8 x i16> [[ADD_2]], ptr [[DST]], align 16
2204; CHECK-NEXT:    ret void
2205;
2206  %xor = xor <8 x i8> %v3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
2207  %umull.1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v1, <8 x i8> %v3)
2208  %add.1 = sub <8 x i16> <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>, %umull.1
2209  %umull.2 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %v2, <8 x i8> %xor)
2210  %add.2 = sub <8 x i16> %add.1, %umull.2
2211  store <8 x i16> %add.2, ptr %dst
2212  ret void
2213}
2214
2215define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
2216; CHECK-LABEL: define void @umlsl2d_chain_with_constant(
2217; CHECK-SAME: ptr [[DST:%.*]], <2 x i32> [[V1:%.*]], <2 x i32> [[V2:%.*]], <2 x i32> [[V3:%.*]]) {
2218; CHECK-NEXT:    call void @llvm.donothing()
2219; CHECK-NEXT:    [[XOR:%.*]] = xor <2 x i32> [[V3]], splat (i32 -1)
2220; CHECK-NEXT:    [[UMULL_1:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V1]], <2 x i32> [[V3]])
2221; CHECK-NEXT:    [[ADD_1:%.*]] = sub <2 x i64> splat (i64 257), [[UMULL_1]]
2222; CHECK-NEXT:    [[UMULL_2:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[V2]], <2 x i32> [[XOR]])
2223; CHECK-NEXT:    [[ADD_2:%.*]] = sub <2 x i64> [[ADD_1]], [[UMULL_2]]
2224; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[DST]] to i64
2225; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 193514046488576
2226; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
2227; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr [[TMP3]], align 16
2228; CHECK-NEXT:    store <2 x i64> [[ADD_2]], ptr [[DST]], align 16
2229; CHECK-NEXT:    ret void
2230;
2231  %xor = xor <2 x i32> %v3, <i32 -1, i32 -1>
2232  %umull.1 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v1, <2 x i32> %v3)
2233  %add.1 = sub <2 x i64> <i64 257, i64 257>, %umull.1
2234  %umull.2 = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %v2, <2 x i32> %xor)
2235  %add.2 = sub <2 x i64> %add.1, %umull.2
2236  store <2 x i64> %add.2, ptr %dst
2237  ret void
2238}
2239
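; Note (editorial): the fmla_* tests show llvm.fma.* handled by straightforward
; propagation: the result shadow is the bitwise OR of the three operand shadows,
; carried at the integer type of matching width (<2 x i32>, <4 x i32>, <2 x i64>).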
2240define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2241; CHECK-LABEL: define <2 x float> @fmla_2s(
2242; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2243; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr @__msan_param_tls, align 8
2244; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2245; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2246; CHECK-NEXT:    call void @llvm.donothing()
2247; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
2248; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2249; CHECK:       [[BB4]]:
2250; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2251; CHECK-NEXT:    unreachable
2252; CHECK:       [[BB5]]:
2253; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
2254; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2255; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2256; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2257; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
2258; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
2259; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2260; CHECK:       [[BB9]]:
2261; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2262; CHECK-NEXT:    unreachable
2263; CHECK:       [[BB10]]:
2264; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
2265; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2266; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2267; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2268; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
2269; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2270; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2271; CHECK:       [[BB14]]:
2272; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2273; CHECK-NEXT:    unreachable
2274; CHECK:       [[BB15]]:
2275; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[C]], align 8
2276; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2277; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2278; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2279; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
2280; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]]
2281; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP]], [[_MSLD2]]
2282; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
2283; CHECK-NEXT:    store <2 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
2284; CHECK-NEXT:    ret <2 x float> [[TMP4]]
2285;
2286  %temp1 = load <2 x float>, ptr %A
2287  %temp2 = load <2 x float>, ptr %B
2288  %temp3 = load <2 x float>, ptr %C
2289  %temp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %temp1, <2 x float> %temp2, <2 x float> %temp3)
2290  ret <2 x float> %temp4
2291}
2292
2293define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2294; CHECK-LABEL: define <4 x float> @fmla_4s(
2295; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2296; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr @__msan_param_tls, align 8
2297; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2298; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2299; CHECK-NEXT:    call void @llvm.donothing()
2300; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
2301; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2302; CHECK:       [[BB4]]:
2303; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2304; CHECK-NEXT:    unreachable
2305; CHECK:       [[BB5]]:
2306; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
2307; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2308; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2309; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2310; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
2311; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
2312; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2313; CHECK:       [[BB9]]:
2314; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2315; CHECK-NEXT:    unreachable
2316; CHECK:       [[BB10]]:
2317; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
2318; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2319; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2320; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2321; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP13]], align 16
2322; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2323; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2324; CHECK:       [[BB14]]:
2325; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2326; CHECK-NEXT:    unreachable
2327; CHECK:       [[BB15]]:
2328; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[C]], align 16
2329; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2330; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2331; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2332; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
2333; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]]
2334; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD2]]
2335; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
2336; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
2337; CHECK-NEXT:    ret <4 x float> [[TMP4]]
2338;
2339  %temp1 = load <4 x float>, ptr %A
2340  %temp2 = load <4 x float>, ptr %B
2341  %temp3 = load <4 x float>, ptr %C
2342  %temp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %temp1, <4 x float> %temp2, <4 x float> %temp3)
2343  ret <4 x float> %temp4
2344}
2345
2346define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2347; CHECK-LABEL: define <2 x double> @fmla_2d(
2348; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2349; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr @__msan_param_tls, align 8
2350; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2351; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2352; CHECK-NEXT:    call void @llvm.donothing()
2353; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
2354; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2355; CHECK:       [[BB4]]:
2356; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2357; CHECK-NEXT:    unreachable
2358; CHECK:       [[BB5]]:
2359; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
2360; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2361; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2362; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2363; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 16
2364; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
2365; CHECK-NEXT:    br i1 [[_MSCMP4]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2366; CHECK:       [[BB9]]:
2367; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2368; CHECK-NEXT:    unreachable
2369; CHECK:       [[BB10]]:
2370; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
2371; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2372; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2373; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2374; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
2375; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2376; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2377; CHECK:       [[BB14]]:
2378; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2379; CHECK-NEXT:    unreachable
2380; CHECK:       [[BB15]]:
2381; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[C]], align 16
2382; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2383; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2384; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2385; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
2386; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]]
2387; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD2]]
2388; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
2389; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
2390; CHECK-NEXT:    ret <2 x double> [[TMP4]]
2391;
2392  %temp1 = load <2 x double>, ptr %A
2393  %temp2 = load <2 x double>, ptr %B
2394  %temp3 = load <2 x double>, ptr %C
2395  %temp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %temp1, <2 x double> %temp2, <2 x double> %temp3)
2396  ret <2 x double> %temp4
2397}
2398
2399declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
2400declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
2401declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
2402
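; Note (editorial): in the fmls_* tests the negation is an fsub from a -0.0 splat
; before the fma; the splat contributes a zeroinitializer shadow, so the fsub shadow
; is just the shadow of %B, which then feeds the usual three-way OR for llvm.fma.*.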
2403define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2404; CHECK-LABEL: define <2 x float> @fmls_2s(
2405; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2406; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2407; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2408; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2409; CHECK-NEXT:    call void @llvm.donothing()
2410; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2411; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2412; CHECK:       [[BB4]]:
2413; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2414; CHECK-NEXT:    unreachable
2415; CHECK:       [[BB5]]:
2416; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
2417; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2418; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2419; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2420; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
2421; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2422; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2423; CHECK:       [[BB9]]:
2424; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2425; CHECK-NEXT:    unreachable
2426; CHECK:       [[BB10]]:
2427; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
2428; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2429; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2430; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2431; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
2432; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2433; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2434; CHECK:       [[BB14]]:
2435; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2436; CHECK-NEXT:    unreachable
2437; CHECK:       [[BB15]]:
2438; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[C]], align 8
2439; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2440; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2441; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2442; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
2443; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> zeroinitializer, [[_MSLD1]]
2444; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[TMP2]]
2445; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSLD]], [[_MSPROP]]
2446; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i32> [[_MSPROP3]], [[_MSLD2]]
2447; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
2448; CHECK-NEXT:    store <2 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2449; CHECK-NEXT:    ret <2 x float> [[TMP5]]
2450;
2451  %temp1 = load <2 x float>, ptr %A
2452  %temp2 = load <2 x float>, ptr %B
2453  %temp3 = load <2 x float>, ptr %C
2454  %temp4 = fsub <2 x float> <float -0.0, float -0.0>, %temp2
2455  %temp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %temp1, <2 x float> %temp4, <2 x float> %temp3)
2456  ret <2 x float> %temp5
2457}
2458
2459define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2460; CHECK-LABEL: define <4 x float> @fmls_4s(
2461; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2462; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2463; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2464; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2465; CHECK-NEXT:    call void @llvm.donothing()
2466; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2467; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2468; CHECK:       [[BB4]]:
2469; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2470; CHECK-NEXT:    unreachable
2471; CHECK:       [[BB5]]:
2472; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
2473; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2474; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2475; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2476; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
2477; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2478; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2479; CHECK:       [[BB9]]:
2480; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2481; CHECK-NEXT:    unreachable
2482; CHECK:       [[BB10]]:
2483; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
2484; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2485; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2486; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2487; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP13]], align 16
2488; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2489; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2490; CHECK:       [[BB14]]:
2491; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2492; CHECK-NEXT:    unreachable
2493; CHECK:       [[BB15]]:
2494; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[C]], align 16
2495; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2496; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2497; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2498; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
2499; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[_MSLD1]]
2500; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[TMP2]]
2501; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSLD]], [[_MSPROP]]
2502; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSLD2]]
2503; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2504; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2505; CHECK-NEXT:    ret <4 x float> [[TMP5]]
2506;
2507  %temp1 = load <4 x float>, ptr %A
2508  %temp2 = load <4 x float>, ptr %B
2509  %temp3 = load <4 x float>, ptr %C
2510  %temp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %temp2
2511  %temp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %temp1, <4 x float> %temp4, <4 x float> %temp3)
2512  ret <4 x float> %temp5
2513}
2514
2515define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2516; CHECK-LABEL: define <2 x double> @fmls_2d(
2517; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2518; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2519; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2520; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2521; CHECK-NEXT:    call void @llvm.donothing()
2522; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2523; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2524; CHECK:       [[BB4]]:
2525; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2526; CHECK-NEXT:    unreachable
2527; CHECK:       [[BB5]]:
2528; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
2529; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2530; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2531; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2532; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 16
2533; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2534; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2535; CHECK:       [[BB9]]:
2536; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2537; CHECK-NEXT:    unreachable
2538; CHECK:       [[BB10]]:
2539; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
2540; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2541; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2542; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2543; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
2544; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2545; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2546; CHECK:       [[BB14]]:
2547; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2548; CHECK-NEXT:    unreachable
2549; CHECK:       [[BB15]]:
2550; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[C]], align 16
2551; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2552; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2553; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2554; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
2555; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[_MSLD1]]
2556; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[TMP2]]
2557; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSLD]], [[_MSPROP]]
2558; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSLD2]]
2559; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2560; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2561; CHECK-NEXT:    ret <2 x double> [[TMP5]]
2562;
2563  %temp1 = load <2 x double>, ptr %A
2564  %temp2 = load <2 x double>, ptr %B
2565  %temp3 = load <2 x double>, ptr %C
2566  %temp4 = fsub <2 x double> <double -0.0, double -0.0>, %temp2
2567  %temp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %temp1, <2 x double> %temp4, <2 x double> %temp3)
2568  ret <2 x double> %temp5
2569}
2570
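; Note (editorial): fmls_commuted_neg_* repeat the fmls_* pattern with the negated
; value passed as the first fma operand; the shadow ORs are emitted in the commuted
; order but the resulting shadow is the same three-way OR.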
2571define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2572; CHECK-LABEL: define <2 x float> @fmls_commuted_neg_2s(
2573; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2574; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2575; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2576; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2577; CHECK-NEXT:    call void @llvm.donothing()
2578; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2579; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2580; CHECK:       [[BB4]]:
2581; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2582; CHECK-NEXT:    unreachable
2583; CHECK:       [[BB5]]:
2584; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[A]], align 8
2585; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2586; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2587; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2588; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP8]], align 8
2589; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2590; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2591; CHECK:       [[BB9]]:
2592; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2593; CHECK-NEXT:    unreachable
2594; CHECK:       [[BB10]]:
2595; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[B]], align 8
2596; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2597; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2598; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2599; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP13]], align 8
2600; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2601; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2602; CHECK:       [[BB14]]:
2603; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2604; CHECK-NEXT:    unreachable
2605; CHECK:       [[BB15]]:
2606; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[C]], align 8
2607; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2608; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2609; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2610; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i32>, ptr [[TMP18]], align 8
2611; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i32> zeroinitializer, [[_MSLD1]]
2612; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[TMP2]]
2613; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP]], [[_MSLD]]
2614; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i32> [[_MSPROP3]], [[_MSLD2]]
2615; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP1]], <2 x float> [[TMP3]])
2616; CHECK-NEXT:    store <2 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2617; CHECK-NEXT:    ret <2 x float> [[TMP5]]
2618;
2619  %temp1 = load <2 x float>, ptr %A
2620  %temp2 = load <2 x float>, ptr %B
2621  %temp3 = load <2 x float>, ptr %C
2622  %temp4 = fsub <2 x float> <float -0.0, float -0.0>, %temp2
2623  %temp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %temp4, <2 x float> %temp1, <2 x float> %temp3)
2624  ret <2 x float> %temp5
2625}
2626
2627define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2628; CHECK-LABEL: define <4 x float> @fmls_commuted_neg_4s(
2629; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2630; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2631; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2632; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2633; CHECK-NEXT:    call void @llvm.donothing()
2634; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2635; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2636; CHECK:       [[BB4]]:
2637; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2638; CHECK-NEXT:    unreachable
2639; CHECK:       [[BB5]]:
2640; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 16
2641; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2642; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2643; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2644; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
2645; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2646; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2647; CHECK:       [[BB9]]:
2648; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2649; CHECK-NEXT:    unreachable
2650; CHECK:       [[BB10]]:
2651; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[B]], align 16
2652; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2653; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2654; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2655; CHECK-NEXT:    [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP13]], align 16
2656; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2657; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2658; CHECK:       [[BB14]]:
2659; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2660; CHECK-NEXT:    unreachable
2661; CHECK:       [[BB15]]:
2662; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[C]], align 16
2663; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2664; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2665; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2666; CHECK-NEXT:    [[_MSLD2:%.*]] = load <4 x i32>, ptr [[TMP18]], align 16
2667; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[_MSLD1]]
2668; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[TMP2]]
2669; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
2670; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSLD2]]
2671; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x float> [[TMP3]])
2672; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2673; CHECK-NEXT:    ret <4 x float> [[TMP5]]
2674;
2675  %temp1 = load <4 x float>, ptr %A
2676  %temp2 = load <4 x float>, ptr %B
2677  %temp3 = load <4 x float>, ptr %C
2678  %temp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %temp2
2679  %temp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %temp4, <4 x float> %temp1, <4 x float> %temp3)
2680  ret <4 x float> %temp5
2681}
2682
2683define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory {
2684; CHECK-LABEL: define <2 x double> @fmls_commuted_neg_2d(
2685; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] {
2686; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8
2687; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2688; CHECK-NEXT:    [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2689; CHECK-NEXT:    call void @llvm.donothing()
2690; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
2691; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
2692; CHECK:       [[BB4]]:
2693; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2694; CHECK-NEXT:    unreachable
2695; CHECK:       [[BB5]]:
2696; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A]], align 16
2697; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
2698; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
2699; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
2700; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 16
2701; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i64 [[TMP10]], 0
2702; CHECK-NEXT:    br i1 [[_MSCMP5]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
2703; CHECK:       [[BB9]]:
2704; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2705; CHECK-NEXT:    unreachable
2706; CHECK:       [[BB10]]:
2707; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[B]], align 16
2708; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
2709; CHECK-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576
2710; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
2711; CHECK-NEXT:    [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP13]], align 16
2712; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP14]], 0
2713; CHECK-NEXT:    br i1 [[_MSCMP6]], label %[[BB14:.*]], label %[[BB15:.*]], !prof [[PROF1]]
2714; CHECK:       [[BB14]]:
2715; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
2716; CHECK-NEXT:    unreachable
2717; CHECK:       [[BB15]]:
2718; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[C]], align 16
2719; CHECK-NEXT:    [[TMP16:%.*]] = ptrtoint ptr [[C]] to i64
2720; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 193514046488576
2721; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
2722; CHECK-NEXT:    [[_MSLD2:%.*]] = load <2 x i64>, ptr [[TMP18]], align 16
2723; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[_MSLD1]]
2724; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[TMP2]]
2725; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
2726; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSLD2]]
2727; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP1]], <2 x double> [[TMP3]])
2728; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
2729; CHECK-NEXT:    ret <2 x double> [[TMP5]]
2730;
2731  %temp1 = load <2 x double>, ptr %A
2732  %temp2 = load <2 x double>, ptr %B
2733  %temp3 = load <2 x double>, ptr %C
2734  %temp4 = fsub <2 x double> <double -0.0, double -0.0>, %temp2
2735  %temp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %temp4, <2 x double> %temp1, <2 x double> %temp3)
2736  ret <2 x double> %temp5
2737}
2738
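; Note (editorial): the fmls_indexed_* tests take their operands in registers and are
; not sanitize_memory, so the only visible instrumentation is clearing the return-value
; shadow in __msan_retval_tls.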
2739define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
2740; CHECK-LABEL: define <2 x float> @fmls_indexed_2s(
2741; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) #[[ATTR3:[0-9]+]] {
2742; CHECK-NEXT:  [[ENTRY:.*:]]
2743; CHECK-NEXT:    call void @llvm.donothing()
2744; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[C]]
2745; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> undef, <2 x i32> zeroinitializer
2746; CHECK-NEXT:    [[FMLS1:%.*]] = tail call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP0]], <2 x float> [[LANE]], <2 x float> [[A]])
2747; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2748; CHECK-NEXT:    ret <2 x float> [[FMLS1]]
2749;
2750entry:
2751  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
2752  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
2753  %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
2754  ret <2 x float> %fmls1
2755}
2756
2757define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
2758; CHECK-LABEL: define <4 x float> @fmls_indexed_4s(
2759; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR3]] {
2760; CHECK-NEXT:  [[ENTRY:.*:]]
2761; CHECK-NEXT:    call void @llvm.donothing()
2762; CHECK-NEXT:    [[TMP0:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
2763; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> zeroinitializer
2764; CHECK-NEXT:    [[FMLS1:%.*]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP0]], <4 x float> [[LANE]], <4 x float> [[A]])
2765; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2766; CHECK-NEXT:    ret <4 x float> [[FMLS1]]
2767;
2768entry:
2769  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
2770  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
2771  %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
2772  ret <4 x float> %fmls1
2773}
2774
2775define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
2776; CHECK-LABEL: define <2 x double> @fmls_indexed_2d(
2777; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR3]] {
2778; CHECK-NEXT:  [[ENTRY:.*:]]
2779; CHECK-NEXT:    call void @llvm.donothing()
2780; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[C]]
2781; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[B]], <2 x double> undef, <2 x i32> zeroinitializer
2782; CHECK-NEXT:    [[FMLS1:%.*]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP0]], <2 x double> [[LANE]], <2 x double> [[A]])
2783; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
2784; CHECK-NEXT:    ret <2 x double> [[FMLS1]]
2785;
2786entry:
2787  %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
2788  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
2789  %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
2790  ret <2 x double> %fmls1
2791}
2792
2793define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
2794; CHECK-LABEL: define <2 x float> @fmla_indexed_scalar_2s(
2795; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR3]] {
2796; CHECK-NEXT:  [[ENTRY:.*:]]
2797; CHECK-NEXT:    call void @llvm.donothing()
2798; CHECK-NEXT:    [[V1:%.*]] = insertelement <2 x float> undef, float [[C]], i32 0
2799; CHECK-NEXT:    [[V2:%.*]] = insertelement <2 x float> [[V1]], float [[C]], i32 1
2800; CHECK-NEXT:    [[FMLA1:%.*]] = tail call <2 x float> @llvm.fma.v2f32(<2 x float> [[V1]], <2 x float> [[B]], <2 x float> [[A]]) #[[ATTR7:[0-9]+]]
2801; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2802; CHECK-NEXT:    ret <2 x float> [[FMLA1]]
2803;
2804entry:
2805  %v1 = insertelement <2 x float> undef, float %c, i32 0
2806  %v2 = insertelement <2 x float> %v1, float %c, i32 1
2807  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
2808  ret <2 x float> %fmla1
2809}
2810
2811define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
2812; CHECK-LABEL: define <4 x float> @fmla_indexed_scalar_4s(
2813; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR3]] {
2814; CHECK-NEXT:  [[ENTRY:.*:]]
2815; CHECK-NEXT:    call void @llvm.donothing()
2816; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[C]], i32 0
2817; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[C]], i32 1
2818; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[C]], i32 2
2819; CHECK-NEXT:    [[V4:%.*]] = insertelement <4 x float> [[V3]], float [[C]], i32 3
2820; CHECK-NEXT:    [[FMLA1:%.*]] = tail call <4 x float> @llvm.fma.v4f32(<4 x float> [[V4]], <4 x float> [[B]], <4 x float> [[A]]) #[[ATTR7]]
2821; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2822; CHECK-NEXT:    ret <4 x float> [[FMLA1]]
2823;
2824entry:
2825  %v1 = insertelement <4 x float> undef, float %c, i32 0
2826  %v2 = insertelement <4 x float> %v1, float %c, i32 1
2827  %v3 = insertelement <4 x float> %v2, float %c, i32 2
2828  %v4 = insertelement <4 x float> %v3, float %c, i32 3
2829  %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
2830  ret <4 x float> %fmla1
2831}
2832
2833define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
2834; CHECK-LABEL: define <2 x double> @fmla_indexed_scalar_2d(
2835; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], double [[C:%.*]]) #[[ATTR3]] {
2836; CHECK-NEXT:  [[ENTRY:.*:]]
2837; CHECK-NEXT:    call void @llvm.donothing()
2838; CHECK-NEXT:    [[V1:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
2839; CHECK-NEXT:    [[V2:%.*]] = insertelement <2 x double> [[V1]], double [[C]], i32 1
2840; CHECK-NEXT:    [[FMLA1:%.*]] = tail call <2 x double> @llvm.fma.v2f64(<2 x double> [[V2]], <2 x double> [[B]], <2 x double> [[A]]) #[[ATTR7]]
2841; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
2842; CHECK-NEXT:    ret <2 x double> [[FMLA1]]
2843;
2844entry:
2845  %v1 = insertelement <2 x double> undef, double %c, i32 0
2846  %v2 = insertelement <2 x double> %v1, double %c, i32 1
2847  %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
2848  ret <2 x double> %fmla1
2849}
2850
2851define <2 x float> @fmls_indexed_2s_strict(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp strictfp {
2852; CHECK-LABEL: define <2 x float> @fmls_indexed_2s_strict(
2853; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) #[[ATTR4:[0-9]+]] {
2854; CHECK-NEXT:  [[ENTRY:.*:]]
2855; CHECK-NEXT:    call void @llvm.donothing()
2856; CHECK-NEXT:    [[TMP0:%.*]] = fneg <2 x float> [[C]]
2857; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> undef, <2 x i32> zeroinitializer
2858; CHECK-NEXT:    [[FMLS1:%.*]] = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[TMP0]], <2 x float> [[LANE]], <2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
2859; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2860; CHECK-NEXT:    ret <2 x float> [[FMLS1]]
2861;
2862entry:
2863  %0 = fneg <2 x float> %c
2864  %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
2865  %fmls1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
2866  ret <2 x float> %fmls1
2867}
2868
2869define <4 x float> @fmls_indexed_4s_strict(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp strictfp {
2870; CHECK-LABEL: define <4 x float> @fmls_indexed_4s_strict(
2871; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR4]] {
2872; CHECK-NEXT:  [[ENTRY:.*:]]
2873; CHECK-NEXT:    call void @llvm.donothing()
2874; CHECK-NEXT:    [[TMP0:%.*]] = fneg <4 x float> [[C]]
2875; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> zeroinitializer
2876; CHECK-NEXT:    [[FMLS1:%.*]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[TMP0]], <4 x float> [[LANE]], <4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR9]]
2877; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2878; CHECK-NEXT:    ret <4 x float> [[FMLS1]]
2879;
2880entry:
2881  %0 = fneg <4 x float> %c
2882  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
2883  %fmls1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
2884  ret <4 x float> %fmls1
2885}
2886
2887define <2 x double> @fmls_indexed_2d_strict(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp strictfp {
2888; CHECK-LABEL: define <2 x double> @fmls_indexed_2d_strict(
2889; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR4]] {
2890; CHECK-NEXT:  [[ENTRY:.*:]]
2891; CHECK-NEXT:    call void @llvm.donothing()
2892; CHECK-NEXT:    [[TMP0:%.*]] = fneg <2 x double> [[C]]
2893; CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[B]], <2 x double> undef, <2 x i32> zeroinitializer
2894; CHECK-NEXT:    [[FMLS1:%.*]] = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[TMP0]], <2 x double> [[LANE]], <2 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR9]]
2895; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
2896; CHECK-NEXT:    ret <2 x double> [[FMLS1]]
2897;
2898entry:
2899  %0 = fneg <2 x double> %c
2900  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
2901  %fmls1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
2902  ret <2 x double> %fmls1
2903}
2904
2905define <2 x float> @fmla_indexed_scalar_2s_strict(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp strictfp {
2906; CHECK-LABEL: define <2 x float> @fmla_indexed_scalar_2s_strict(
2907; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR4]] {
2908; CHECK-NEXT:  [[ENTRY:.*:]]
2909; CHECK-NEXT:    call void @llvm.donothing()
2910; CHECK-NEXT:    [[V1:%.*]] = insertelement <2 x float> undef, float [[C]], i32 0
2911; CHECK-NEXT:    [[V2:%.*]] = insertelement <2 x float> [[V1]], float [[C]], i32 1
2912; CHECK-NEXT:    [[FMLA1:%.*]] = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[V2]], <2 x float> [[B]], <2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR9]]
2913; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2914; CHECK-NEXT:    ret <2 x float> [[FMLA1]]
2915;
2916entry:
2917  %v1 = insertelement <2 x float> undef, float %c, i32 0
2918  %v2 = insertelement <2 x float> %v1, float %c, i32 1
2919  %fmla1 = tail call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
2920  ret <2 x float> %fmla1
2921}
2922
2923define <4 x float> @fmla_indexed_scalar_4s_strict(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp strictfp {
2924; CHECK-LABEL: define <4 x float> @fmla_indexed_scalar_4s_strict(
2925; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], float [[C:%.*]]) #[[ATTR4]] {
2926; CHECK-NEXT:  [[ENTRY:.*:]]
2927; CHECK-NEXT:    call void @llvm.donothing()
2928; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[C]], i32 0
2929; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[C]], i32 1
2930; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[C]], i32 2
2931; CHECK-NEXT:    [[V4:%.*]] = insertelement <4 x float> [[V3]], float [[C]], i32 3
2932; CHECK-NEXT:    [[FMLA1:%.*]] = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[V4]], <4 x float> [[B]], <4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR9]]
2933; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
2934; CHECK-NEXT:    ret <4 x float> [[FMLA1]]
2935;
2936entry:
2937  %v1 = insertelement <4 x float> undef, float %c, i32 0
2938  %v2 = insertelement <4 x float> %v1, float %c, i32 1
2939  %v3 = insertelement <4 x float> %v2, float %c, i32 2
2940  %v4 = insertelement <4 x float> %v3, float %c, i32 3
2941  %fmla1 = tail call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
2942  ret <4 x float> %fmla1
2943}
2944
2945define <2 x double> @fmla_indexed_scalar_2d_strict(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp strictfp {
2946; CHECK-LABEL: define <2 x double> @fmla_indexed_scalar_2d_strict(
2947; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], double [[C:%.*]]) #[[ATTR4]] {
2948; CHECK-NEXT:  [[ENTRY:.*:]]
2949; CHECK-NEXT:    call void @llvm.donothing()
2950; CHECK-NEXT:    [[V1:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
2951; CHECK-NEXT:    [[V2:%.*]] = insertelement <2 x double> [[V1]], double [[C]], i32 1
2952; CHECK-NEXT:    [[FMLA1:%.*]] = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[V2]], <2 x double> [[B]], <2 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR9]]
2953; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
2954; CHECK-NEXT:    ret <2 x double> [[FMLA1]]
2955;
2956entry:
2957  %v1 = insertelement <2 x double> undef, double %c, i32 0
2958  %v2 = insertelement <2 x double> %v1, double %c, i32 1
2959  %fmla1 = tail call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
2960  ret <2 x double> %fmla1
2961}
2962
2963attributes #0 = { strictfp }
2964
2965declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x float>, <2 x float>, metadata, metadata)
2966declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
2967declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
2968
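; Lane-splat integer multiplies: the shadow of the splatted operand is run
; through the same shufflevector (an all-ones vector stands in for the poison
; operand) and then OR-ed with the other operand's shadow.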
2969define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory {
2970; CHECK-LABEL: define <4 x i16> @mul_4h(
2971; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
2972; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
2973; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
2974; CHECK-NEXT:    call void @llvm.donothing()
2975; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2976; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2977; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
2978; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[A]], [[TMP3]]
2979; CHECK-NEXT:    store <4 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
2980; CHECK-NEXT:    ret <4 x i16> [[TMP4]]
2981;
2982  %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2983  %temp4 = mul <4 x i16> %A, %temp3
2984  ret <4 x i16> %temp4
2985}
2986
2987define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory {
2988; CHECK-LABEL: define <8 x i16> @mul_8h(
2989; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
2990; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
2991; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
2992; CHECK-NEXT:    call void @llvm.donothing()
2993; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2994; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2995; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[TMP2]], [[_MSPROP]]
2996; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i16> [[A]], [[TMP3]]
2997; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
2998; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
2999;
3000  %temp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3001  %temp4 = mul <8 x i16> %A, %temp3
3002  ret <8 x i16> %temp4
3003}
3004
3005define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory {
3006; CHECK-LABEL: define <2 x i32> @mul_2s(
3007; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
3008; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3009; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3010; CHECK-NEXT:    call void @llvm.donothing()
3011; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3012; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3013; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3014; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[A]], [[TMP3]]
3015; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3016; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
3017;
3018  %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3019  %temp4 = mul <2 x i32> %A, %temp3
3020  ret <2 x i32> %temp4
3021}
3022
3023define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory {
3024; CHECK-LABEL: define <4 x i32> @mul_4s(
3025; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
3026; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3027; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
3028; CHECK-NEXT:    call void @llvm.donothing()
3029; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3030; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3031; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP2]], [[_MSPROP]]
3032; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[A]], [[TMP3]]
3033; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3034; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3035;
3036  %temp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3037  %temp4 = mul <4 x i32> %A, %temp3
3038  ret <4 x i32> %temp4
3039}
3040
3041define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind sanitize_memory {
3042; CHECK-LABEL: define <2 x i64> @mul_2d(
3043; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
3044; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
3045; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3046; CHECK-NEXT:    call void @llvm.donothing()
3047; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], [[TMP2]]
3048; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i64> [[A]], [[B]]
3049; CHECK-NEXT:    store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
3050; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
3051;
3052  %temp1 = mul <2 x i64> %A, %B
3053  ret <2 x i64> %temp1
3054}
3055
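; Floating-point multiplies by lane or element follow the same pattern; the
; shadow stays in the integer domain (e.g. <2 x i32> for <2 x float>) and
; tracks the shuffle/extractelement before the final OR.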
3056define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind sanitize_memory {
3057; CHECK-LABEL: define <2 x float> @fmul_lane_2s(
3058; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] {
3059; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3060; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3061; CHECK-NEXT:    call void @llvm.donothing()
3062; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3063; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
3064; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3065; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[A]], [[TMP3]]
3066; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3067; CHECK-NEXT:    ret <2 x float> [[TMP4]]
3068;
3069  %temp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
3070  %temp4 = fmul <2 x float> %A, %temp3
3071  ret <2 x float> %temp4
3072}
3073
3074define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind sanitize_memory {
3075; CHECK-LABEL: define <4 x float> @fmul_lane_4s(
3076; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
3077; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3078; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
3079; CHECK-NEXT:    call void @llvm.donothing()
3080; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3081; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3082; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP2]], [[_MSPROP]]
3083; CHECK-NEXT:    [[TMP4:%.*]] = fmul <4 x float> [[A]], [[TMP3]]
3084; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3085; CHECK-NEXT:    ret <4 x float> [[TMP4]]
3086;
3087  %temp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3088  %temp4 = fmul <4 x float> %A, %temp3
3089  ret <4 x float> %temp4
3090}
3091
3092define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind sanitize_memory {
3093; CHECK-LABEL: define <2 x double> @fmul_lane_2d(
3094; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
3095; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3096; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
3097; CHECK-NEXT:    call void @llvm.donothing()
3098; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <2 x i32> <i32 1, i32 1>
3099; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
3100; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP2]], [[_MSPROP]]
3101; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[A]], [[TMP3]]
3102; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3103; CHECK-NEXT:    ret <2 x double> [[TMP4]]
3104;
3105  %temp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
3106  %temp4 = fmul <2 x double> %A, %temp3
3107  ret <2 x double> %temp4
3108}
3109
3110define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind sanitize_memory {
3111; CHECK-LABEL: define float @fmul_lane_s(
3112; CHECK-SAME: float [[A:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR0]] {
3113; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3114; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
3115; CHECK-NEXT:    call void @llvm.donothing()
3116; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
3117; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x float> [[VEC]], i32 3
3118; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i32 [[TMP2]], [[_MSPROP]]
3119; CHECK-NEXT:    [[RES:%.*]] = fmul float [[A]], [[B]]
3120; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3121; CHECK-NEXT:    ret float [[RES]]
3122;
3123  %B = extractelement <4 x float> %vec, i32 3
3124  %res = fmul float %A, %B
3125  ret float %res
3126}
3127
3128define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind sanitize_memory {
3129; CHECK-LABEL: define double @fmul_lane_d(
3130; CHECK-SAME: double [[A:%.*]], <2 x double> [[VEC:%.*]]) #[[ATTR0]] {
3131; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3132; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
3133; CHECK-NEXT:    call void @llvm.donothing()
3134; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
3135; CHECK-NEXT:    [[B:%.*]] = extractelement <2 x double> [[VEC]], i32 1
3136; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i64 [[TMP2]], [[_MSPROP]]
3137; CHECK-NEXT:    [[RES:%.*]] = fmul double [[A]], [[B]]
3138; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3139; CHECK-NEXT:    ret double [[RES]]
3140;
3141  %B = extractelement <2 x double> %vec, i32 1
3142  %res = fmul double %A, %B
3143  ret double %res
3144}
3145
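; llvm.aarch64.neon.fmulx.* is instrumented like a generic vector intrinsic:
; the operand shadows are OR-ed together (note the extra 'or ..., zeroinitializer').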
3148define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind sanitize_memory {
3149; CHECK-LABEL: define <2 x float> @fmulx_lane_2s(
3150; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] {
3151; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3152; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3153; CHECK-NEXT:    call void @llvm.donothing()
3154; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3155; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
3156; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3157; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
3158; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[TMP3]])
3159; CHECK-NEXT:    store <2 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
3160; CHECK-NEXT:    ret <2 x float> [[TMP4]]
3161;
3162  %temp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
3163  %temp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %temp3)
3164  ret <2 x float> %temp4
3165}
3166
3167define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind sanitize_memory {
3168; CHECK-LABEL: define <4 x float> @fmulx_lane_4s(
3169; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
3170; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3171; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
3172; CHECK-NEXT:    call void @llvm.donothing()
3173; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3174; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3175; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP2]], [[_MSPROP]]
3176; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], zeroinitializer
3177; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[TMP3]])
3178; CHECK-NEXT:    store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
3179; CHECK-NEXT:    ret <4 x float> [[TMP4]]
3180;
3181  %temp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3182  %temp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %temp3)
3183  ret <4 x float> %temp4
3184}
3185
3186define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind sanitize_memory {
3187; CHECK-LABEL: define <2 x double> @fmulx_lane_2d(
3188; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
3189; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3190; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
3191; CHECK-NEXT:    call void @llvm.donothing()
3192; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <2 x i32> <i32 1, i32 1>
3193; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
3194; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP2]], [[_MSPROP]]
3195; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], zeroinitializer
3196; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[TMP3]])
3197; CHECK-NEXT:    store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
3198; CHECK-NEXT:    ret <2 x double> [[TMP4]]
3199;
3200  %temp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
3201  %temp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %temp3)
3202  ret <2 x double> %temp4
3203}
3204
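; sqdmulh/sqrdmulh (vector and scalar forms) likewise propagate the plain OR of
; the operand shadows, keeping the operand element width.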
3205define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory {
3206; CHECK-LABEL: define <4 x i16> @sqdmulh_lane_4h(
3207; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
3208; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3209; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
3210; CHECK-NEXT:    call void @llvm.donothing()
3211; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3212; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3213; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
3214; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[TMP3]])
3215; CHECK-NEXT:    store <4 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3216; CHECK-NEXT:    ret <4 x i16> [[TMP4]]
3217;
3218  %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3219  %temp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %temp3)
3220  ret <4 x i16> %temp4
3221}
3222
3223define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory {
3224; CHECK-LABEL: define <8 x i16> @sqdmulh_lane_8h(
3225; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
3226; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3227; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
3228; CHECK-NEXT:    call void @llvm.donothing()
3229; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3230; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3231; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[TMP2]], [[_MSPROP]]
3232; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[TMP3]])
3233; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3234; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
3235;
3236  %temp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3237  %temp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %temp3)
3238  ret <8 x i16> %temp4
3239}
3240
3241define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory {
3242; CHECK-LABEL: define <2 x i32> @sqdmulh_lane_2s(
3243; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
3244; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3245; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3246; CHECK-NEXT:    call void @llvm.donothing()
3247; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3248; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3249; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3250; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[TMP3]])
3251; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3252; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
3253;
3254  %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3255  %temp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %temp3)
3256  ret <2 x i32> %temp4
3257}
3258
3259define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory {
3260; CHECK-LABEL: define <4 x i32> @sqdmulh_lane_4s(
3261; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
3262; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3263; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
3264; CHECK-NEXT:    call void @llvm.donothing()
3265; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3266; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3267; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP2]], [[_MSPROP]]
3268; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[TMP3]])
3269; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3270; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3271;
3272  %temp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3273  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %temp3)
3274  ret <4 x i32> %temp4
3275}
3276
3277define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind sanitize_memory {
3278; CHECK-LABEL: define i32 @sqdmulh_lane_1s(
3279; CHECK-SAME: i32 [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
3280; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3281; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_tls, align 8
3282; CHECK-NEXT:    call void @llvm.donothing()
3283; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
3284; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[B]], i32 1
3285; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i32 [[TMP4]], [[_MSPROP]]
3286; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[TMP1]])
3287; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3288; CHECK-NEXT:    ret i32 [[TMP2]]
3289;
3290  %temp1 = extractelement <4 x i32> %B, i32 1
3291  %temp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %temp1)
3292  ret i32 %temp2
3293}
3294
3295define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory {
3296; CHECK-LABEL: define <4 x i16> @sqrdmulh_lane_4h(
3297; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
3298; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3299; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
3300; CHECK-NEXT:    call void @llvm.donothing()
3301; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3302; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3303; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
3304; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[TMP3]])
3305; CHECK-NEXT:    store <4 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3306; CHECK-NEXT:    ret <4 x i16> [[TMP4]]
3307;
3308  %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3309  %temp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %temp3)
3310  ret <4 x i16> %temp4
3311}
3312
3313define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory {
3314; CHECK-LABEL: define <8 x i16> @sqrdmulh_lane_8h(
3315; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
3316; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3317; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
3318; CHECK-NEXT:    call void @llvm.donothing()
3319; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3320; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3321; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[TMP2]], [[_MSPROP]]
3322; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[TMP3]])
3323; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3324; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
3325;
3326  %temp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3327  %temp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %temp3)
3328  ret <8 x i16> %temp4
3329}
3330
3331define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory {
3332; CHECK-LABEL: define <2 x i32> @sqrdmulh_lane_2s(
3333; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
3334; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3335; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3336; CHECK-NEXT:    call void @llvm.donothing()
3337; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3338; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3339; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3340; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[TMP3]])
3341; CHECK-NEXT:    store <2 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3342; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
3343;
3344  %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3345  %temp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %temp3)
3346  ret <2 x i32> %temp4
3347}
3348
3349define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory {
3350; CHECK-LABEL: define <4 x i32> @sqrdmulh_lane_4s(
3351; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
3352; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3353; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
3354; CHECK-NEXT:    call void @llvm.donothing()
3355; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3356; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3357; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP2]], [[_MSPROP]]
3358; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[TMP3]])
3359; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3360; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3361;
3362  %temp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3363  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %temp3)
3364  ret <4 x i32> %temp4
3365}
3366
3367define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind sanitize_memory {
3368; CHECK-LABEL: define i32 @sqrdmulh_lane_1s(
3369; CHECK-SAME: i32 [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
3370; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3371; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @__msan_param_tls, align 8
3372; CHECK-NEXT:    call void @llvm.donothing()
3373; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
3374; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[B]], i32 1
3375; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i32 [[TMP4]], [[_MSPROP]]
3376; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[TMP1]])
3377; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
3378; CHECK-NEXT:    ret i32 [[TMP2]]
3379;
3380  %temp1 = extractelement <4 x i32> %B, i32 1
3381  %temp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %temp1)
3382  ret i32 %temp2
3383}
3384
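; sqdmull is handled with a strict check rather than shadow propagation: if any
; shadow bit of either operand is set, __msan_warning_noreturn is reached;
; otherwise a zero result shadow is stored.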
3385define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory {
3386; CHECK-LABEL: define <4 x i32> @sqdmull_lane_4s(
3387; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
3388; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3389; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
3390; CHECK-NEXT:    call void @llvm.donothing()
3391; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3392; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3393; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
3394; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
3395; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
3396; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
3397; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
3398; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
3399; CHECK:       [[BB5]]:
3400; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
3401; CHECK-NEXT:    unreachable
3402; CHECK:       [[BB6]]:
3403; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP3]])
3404; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
3405; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3406;
3407  %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3408  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %temp3)
3409  ret <4 x i32> %temp4
3410}
3411
3412define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory {
3413; CHECK-LABEL: define <2 x i64> @sqdmull_lane_2d(
3414; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
3415; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3416; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3417; CHECK-NEXT:    call void @llvm.donothing()
3418; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3419; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3420; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
3421; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
3422; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
3423; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
3424; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
3425; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
3426; CHECK:       [[BB5]]:
3427; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
3428; CHECK-NEXT:    unreachable
3429; CHECK:       [[BB6]]:
3430; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP3]])
3431; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
3432; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
3433;
3434  %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3435  %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %temp3)
3436  ret <2 x i64> %temp4
3437}
3438
3439define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory {
3440; CHECK-LABEL: define <4 x i32> @sqdmull2_lane_4s(
3441; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
3442; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
3443; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3444; CHECK-NEXT:    call void @llvm.donothing()
3445; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3446; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3447; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3448; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3449; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
3450; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
3451; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
3452; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
3453; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
3454; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
3455; CHECK:       [[BB5]]:
3456; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
3457; CHECK-NEXT:    unreachable
3458; CHECK:       [[BB6]]:
3459; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
3460; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
3461; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3462;
3463  %temp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3464  %temp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3465  %temp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
3466  ret <4 x i32> %temp4
3467}
3468
3469define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory {
3470; CHECK-LABEL: define <2 x i64> @sqdmull2_lane_2d(
3471; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
3472; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
3473; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
3474; CHECK-NEXT:    call void @llvm.donothing()
3475; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
3476; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3477; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3478; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
3479; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
3480; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
3481; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to i64
3482; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
3483; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
3484; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
3485; CHECK:       [[BB5]]:
3486; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
3487; CHECK-NEXT:    unreachable
3488; CHECK:       [[BB6]]:
3489; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
3490; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
3491; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
3492;
3493  %temp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
3494  %temp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
3495  %temp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
3496  ret <2 x i64> %temp4
3497}
3498
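; umull/smull by lane OR the operand shadows and zero-extend the result to the
; widened return type.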
3499define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory {
3500; CHECK-LABEL: define <4 x i32> @umull_lane_4s(
3501; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
3502; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3503; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
3504; CHECK-NEXT:    call void @llvm.donothing()
3505; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3506; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3507; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
3508; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP1]], zeroinitializer
3509; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
3510; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP3]])
3511; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
3512; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3513;
3514  %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3515  %temp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %temp3)
3516  ret <4 x i32> %temp4
3517}
3518
3519define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory {
3520; CHECK-LABEL: define <2 x i64> @umull_lane_2d(
3521; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
3522; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3523; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3524; CHECK-NEXT:    call void @llvm.donothing()
3525; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3526; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3527; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3528; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
3529; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
3530; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP3]])
3531; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
3532; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
3533;
3534  %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3535  %temp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %temp3)
3536  ret <2 x i64> %temp4
3537}
3538
3539define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory {
3540; CHECK-LABEL: define <4 x i32> @smull_lane_4s(
3541; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] {
3542; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3543; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
3544; CHECK-NEXT:    call void @llvm.donothing()
3545; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3546; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3547; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
3548; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP1]], zeroinitializer
3549; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
3550; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP3]])
3551; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
3552; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
3553;
3554  %temp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
3555  %temp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %temp3)
3556  ret <4 x i32> %temp4
3557}
3558
3559define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory {
3560; CHECK-LABEL: define <2 x i64> @smull_lane_2d(
3561; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] {
3562; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
3563; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
3564; CHECK-NEXT:    call void @llvm.donothing()
3565; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
3566; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3567; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
3568; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
3569; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
3570; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP3]])
3571; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
3572; CHECK-NEXT:    ret <2 x i64> [[TMP4]]
3573;
3574  %temp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
3575  %temp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %temp3)
3576  ret <2 x i64> %temp4
3577}
3578
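; Accumulating forms (smlal by lane and the sqdmlal tests that follow): the
; widened multiply shadow is OR-ed with the accumulator's shadow for the
; trailing add.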
define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @smlal_lane_4s(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[C]], [[TMP5]]
; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
  %temp6 = add <4 x i32> %C, %temp5
  ret <4 x i32> %temp6
}

define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @smlal_lane_2d(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[C]], [[TMP5]]
; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
  %temp6 = add <2 x i64> %C, %temp5
  ret <2 x i64> %temp6
}

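; Lane-indexed SQDMLAL pattern: lane-splatted SQDMULL followed by a saturating add.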
define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmlal_lane_4s(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
  %temp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %temp5)
  ret <4 x i32> %temp6
}

define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @sqdmlal_lane_2d(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
  %temp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %temp5)
  ret <2 x i64> %temp6
}

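; Lane-indexed SQDMLAL2 pattern: high half of the first operand, lane-splatted second operand.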
define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmlal2_lane_4s(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %temp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  %temp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %temp5)
  ret <4 x i32> %temp6
}

define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @sqdmlal2_lane_2d(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
; CHECK-NEXT:    store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %temp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
  %temp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %temp5)
  ret <2 x i64> %temp6
}

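; Scalar SQDMLAL/SQDMLSL patterns: a single lane of an SQDMULL result is accumulated with a saturating scalar add or sub.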
define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqdmlal_lane_1s(
; CHECK-SAME: i32 [[A:%.*]], i16 [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP1]], i32 0
; CHECK-NEXT:    [[LHS:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i32 0
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[RHS:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[LHS]], <4 x i16> [[RHS]])
; CHECK-NEXT:    [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 0
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i32 [[TMP3]], 0
; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[PROD]])
; CHECK-NEXT:    store i32 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[RES]]
;
  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %prod = extractelement <4 x i32> %prod.vec, i32 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
  ret i32 %res
}
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)

define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqdmlsl_lane_1s(
; CHECK-SAME: i32 [[A:%.*]], i16 [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP1]], i32 0
; CHECK-NEXT:    [[LHS:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i32 0
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[RHS:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[LHS]], <4 x i16> [[RHS]])
; CHECK-NEXT:    [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 0
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i32 [[TMP3]], 0
; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[PROD]])
; CHECK-NEXT:    store i32 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[RES]]
;
  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %prod = extractelement <4 x i32> %prod.vec, i32 0
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
  ret i32 %res
}
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)

define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqadd_lane1_sqdmull4s(
; CHECK-SAME: i32 [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
; CHECK-NEXT:    [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 1
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i32 [[TMP3]], 0
; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[PROD]])
; CHECK-NEXT:    store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[RES]]
;
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
  %prod = extractelement <4 x i32> %prod.vec, i32 1
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
  ret i32 %res
}

define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqsub_lane1_sqdmull4s(
; CHECK-SAME: i32 [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[PROD_VEC:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
; CHECK-NEXT:    [[PROD:%.*]] = extractelement <4 x i32> [[PROD_VEC]], i32 1
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i32 [[TMP3]], 0
; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[PROD]])
; CHECK-NEXT:    store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[RES]]
;
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
  %prod = extractelement <4 x i32> %prod.vec, i32 1
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
  ret i32 %res
}

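; Scalar 64-bit SQDMLAL/SQDMLSL patterns using sqdmulls.scalar with a saturating i64 add or sub.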
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define i64 @sqdmlal_lane_1d(
; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[RHS:%.*]] = extractelement <2 x i32> [[C]], i32 1
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i32 [[_MSPROP]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
; CHECK:       [[BB4]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB5]]:
; CHECK-NEXT:    [[PROD:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[RHS]])
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i64 [[TMP3]], 0
; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[PROD]])
; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i64 [[RES]]
;
  %rhs = extractelement <2 x i32> %C, i32 1
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
  %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
  ret i64 %res
}
declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)

define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define i64 @sqdmlsl_lane_1d(
; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[RHS:%.*]] = extractelement <2 x i32> [[C]], i32 1
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i32 [[_MSPROP]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
; CHECK:       [[BB4]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB5]]:
; CHECK-NEXT:    [[PROD:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[RHS]])
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i64 [[TMP3]], 0
; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[PROD]])
; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i64 [[RES]]
;
  %rhs = extractelement <2 x i32> %C, i32 1
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
  %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
  ret i64 %res
}
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)


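; Lane-indexed UMLAL pattern: lane-splatted UMULL followed by an add accumulate.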
define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @umlal_lane_4s(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[C]], [[TMP5]]
; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
  %temp6 = add <4 x i32> %C, %temp5
  ret <4 x i32> %temp6
}

define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @umlal_lane_2d(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[C]], [[TMP5]]
; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
  %temp6 = add <2 x i64> %C, %temp5
  ret <2 x i64> %temp6
}


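; Lane-indexed SMLSL pattern: lane-splatted SMULL followed by a sub.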
define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @smlsl_lane_4s(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[C]], [[TMP5]]
; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
  %temp6 = sub <4 x i32> %C, %temp5
  ret <4 x i32> %temp6
}

define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @smlsl_lane_2d(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[C]], [[TMP5]]
; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
  %temp6 = sub <2 x i64> %C, %temp5
  ret <2 x i64> %temp6
}

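; Lane-indexed SQDMLSL pattern: lane-splatted SQDMULL followed by a saturating sub.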
define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmlsl_lane_4s(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP2]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
  %temp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %temp5)
  ret <4 x i32> %temp6
}

define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @sqdmlsl_lane_2d(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP2]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
  %temp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %temp5)
  ret <2 x i64> %temp6
}

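; Lane-indexed SQDMLSL2 pattern: high half of the first operand, lane-splatted second operand.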
define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @sqdmlsl2_lane_4s(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> splat (i16 -1), <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[C]], <4 x i32> [[TMP5]])
; CHECK-NEXT:    store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %temp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  %temp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %temp5)
  ret <4 x i32> %temp6
}

define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @sqdmlsl2_lane_2d(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> splat (i32 -1), <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[C]], <2 x i64> [[TMP5]])
; CHECK-NEXT:    store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %temp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %temp1, <2 x i32> %temp2)
  %temp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %temp5)
  ret <2 x i64> %temp6
}

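; Lane-indexed UMLSL pattern: lane-splatted UMULL followed by a sub.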
define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @umlsl_lane_4s(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i16> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i16> [[_MSPROP2]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[C]], [[TMP5]]
; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
;
  %temp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %temp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %temp4)
  %temp6 = sub <4 x i32> %C, %temp5
  ret <4 x i32> %temp6
}

define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @umlsl_lane_2d(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i32> [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[_MSPROP1]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[_MSPROP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[TMP4]])
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[C]], [[TMP5]]
; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[TMP6]]
;
  %temp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %temp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %temp4)
  %temp6 = sub <2 x i64> %C, %temp5
  ret <2 x i64> %temp6
}

; Scalar FMULX
define float @fmulxs(float %a, float %b) nounwind sanitize_memory {
; CHECK-LABEL: define float @fmulxs(
; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0
; CHECK-NEXT:    [[FMULX_I:%.*]] = tail call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]]) #[[ATTR7]]
; CHECK-NEXT:    store i32 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret float [[FMULX_I]]
;
  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
  ret float %fmulx.i
}

define double @fmulxd(double %a, double %b) nounwind sanitize_memory {
; CHECK-LABEL: define double @fmulxd(
; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0
; CHECK-NEXT:    [[FMULX_I:%.*]] = tail call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]]) #[[ATTR7]]
; CHECK-NEXT:    store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret double [[FMULX_I]]
;
  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
  ret double %fmulx.i
}

define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind sanitize_memory {
; CHECK-LABEL: define float @fmulxs_lane(
; CHECK-SAME: float [[A:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x float> [[VEC]], i32 3
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i32 [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i32 [[_MSPROP1]], 0
; CHECK-NEXT:    [[FMULX_I:%.*]] = tail call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]]) #[[ATTR7]]
; CHECK-NEXT:    store i32 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret float [[FMULX_I]]
;
  %b = extractelement <4 x float> %vec, i32 3
  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
  ret float %fmulx.i
}

define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind sanitize_memory {
; CHECK-LABEL: define double @fmulxd_lane(
; CHECK-SAME: double [[A:%.*]], <2 x double> [[VEC:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; CHECK-NEXT:    [[B:%.*]] = extractelement <2 x double> [[VEC]], i32 1
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i64 [[TMP2]], [[_MSPROP]]
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP1]], 0
; CHECK-NEXT:    [[FMULX_I:%.*]] = tail call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]]) #[[ATTR7]]
; CHECK-NEXT:    store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret double [[FMULX_I]]
;
  %b = extractelement <2 x double> %vec, i32 1
  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
  ret double %fmulx.i
}

declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone


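; SMULL2-style patterns: widening multiply of the high halves of the source vectors.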
define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @smull2_8h_simple(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> splat (i8 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> splat (i8 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[_MSPROP]], [[_MSPROP1]]
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i8> [[_MSPROP2]], zeroinitializer
; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[_MSPROP3]] to <8 x i16>
; CHECK-NEXT:    [[TMP3:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
; CHECK-NEXT:    store <8 x i16> [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
;
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
  ret <8 x i16> %3
}

define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind sanitize_memory {
; CHECK-LABEL: define <8 x i16> @foo0(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
; CHECK-NEXT:    [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <8 x i8>
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <8 x i8>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <8 x i8>
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[TMP4]], [[TMP6]]
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i8> [[_MSPROP2]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <8 x i8> [[_MSPROP3]] to <8 x i16>
; CHECK-NEXT:    [[VMULL_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR7]]
; CHECK-NEXT:    store <8 x i16> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i16> [[VMULL_I_I]]
;
  %temp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
  %temp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
  %temp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
  ret <8 x i16> %vmull.i.i
}

define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind sanitize_memory {
; CHECK-LABEL: define <4 x i32> @foo1(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i16> [[TMP8]] to <2 x i64>
; CHECK-NEXT:    [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP9]] to <2 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <4 x i16>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[TMP4]], [[TMP6]]
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP2]], zeroinitializer
; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR7]]
; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
;
  %temp = bitcast <8 x i16> %a to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
  %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %temp2 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind sanitize_memory {
; CHECK-LABEL: define <2 x i64> @foo2(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
; CHECK-NEXT:    [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <2 x i32>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[TMP4]], [[TMP6]]
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP2]], zeroinitializer
4416; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
4417; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR7]]
4418; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
4419; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
4420;
4421  %temp = bitcast <4 x i32> %a to <2 x i64>
4422  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4423  %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
4424  %temp2 = bitcast <4 x i32> %b to <2 x i64>
4425  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4426  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
4427  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
4428  ret <2 x i64> %vmull2.i.i
4429}
4430
4431define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind sanitize_memory {
4432; CHECK-LABEL: define <8 x i16> @foo3(
4433; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
4434; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
4435; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4436; CHECK-NEXT:    call void @llvm.donothing()
4437; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
4438; CHECK-NEXT:    [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
4439; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4440; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4441; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
4442; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <8 x i8>
4443; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
4444; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
4445; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4446; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4447; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <8 x i8>
4448; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <8 x i8>
4449; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[TMP4]], [[TMP6]]
4450; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i8> [[_MSPROP2]], zeroinitializer
4451; CHECK-NEXT:    [[TMP7:%.*]] = zext <8 x i8> [[_MSPROP3]] to <8 x i16>
4452; CHECK-NEXT:    [[VMULL_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR7]]
4453; CHECK-NEXT:    store <8 x i16> [[TMP7]], ptr @__msan_retval_tls, align 8
4454; CHECK-NEXT:    ret <8 x i16> [[VMULL_I_I]]
4455;
4456  %temp = bitcast <16 x i8> %a to <2 x i64>
4457  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4458  %temp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
4459  %temp2 = bitcast <16 x i8> %b to <2 x i64>
4460  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4461  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
4462  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
4463  ret <8 x i16> %vmull.i.i
4464}
4465
4466define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind sanitize_memory {
4467; CHECK-LABEL: define <4 x i32> @foo4(
4468; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
4469; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
4470; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4471; CHECK-NEXT:    call void @llvm.donothing()
4472; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i16> [[TMP8]] to <2 x i64>
4473; CHECK-NEXT:    [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
4474; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4475; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4476; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
4477; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
4478; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP9]] to <2 x i64>
4479; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4480; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4481; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4482; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <4 x i16>
4483; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
4484; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[TMP4]], [[TMP6]]
4485; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP2]], zeroinitializer
4486; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
4487; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR7]]
4488; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
4489; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
4490;
4491  %temp = bitcast <8 x i16> %a to <2 x i64>
4492  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4493  %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
4494  %temp2 = bitcast <8 x i16> %b to <2 x i64>
4495  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4496  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
4497  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
4498  ret <4 x i32> %vmull2.i.i
4499}
4500
4501define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind sanitize_memory {
4502; CHECK-LABEL: define <2 x i64> @foo5(
4503; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
4504; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
4505; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4506; CHECK-NEXT:    call void @llvm.donothing()
4507; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
4508; CHECK-NEXT:    [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
4509; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4510; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4511; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
4512; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
4513; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
4514; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4515; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4516; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4517; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <2 x i32>
4518; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
4519; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[TMP4]], [[TMP6]]
4520; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP2]], zeroinitializer
4521; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
4522; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR7]]
4523; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
4524; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
4525;
4526  %temp = bitcast <4 x i32> %a to <2 x i64>
4527  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4528  %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
4529  %temp2 = bitcast <4 x i32> %b to <2 x i64>
4530  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4531  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
4532  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
4533  ret <2 x i64> %vmull2.i.i
4534}
4535
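; foo6 through foo9a are not marked sanitize_memory, so MSan only inserts the
; llvm.donothing marker and stores a clean (all-zero) return shadow.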
4536define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
4537; CHECK-LABEL: define <4 x i32> @foo6(
4538; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6:[0-9]+]] {
4539; CHECK-NEXT:  [[ENTRY:.*:]]
4540; CHECK-NEXT:    call void @llvm.donothing()
4541; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4542; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
4543; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
4544; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4545; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR7]]
4546; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
4547; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
4548;
4549entry:
4550  %0 = bitcast <8 x i16> %b to <2 x i64>
4551  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
4552  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
4553  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4554  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
4555  ret <4 x i32> %vmull2.i
4556}
4557
4558define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
4559; CHECK-LABEL: define <4 x i32> @foo6a(
4560; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6]] {
4561; CHECK-NEXT:  [[ENTRY:.*:]]
4562; CHECK-NEXT:    call void @llvm.donothing()
4563; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4564; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
4565; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
4566; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4567; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR7]]
4568; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
4569; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
4570;
4571entry:
4572  %0 = bitcast <8 x i16> %b to <2 x i64>
4573  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
4574  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
4575  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4576  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
4577  ret <4 x i32> %vmull2.i
4578}
4579
4580define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
4581; CHECK-LABEL: define <2 x i64> @foo7(
4582; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
4583; CHECK-NEXT:  [[ENTRY:.*:]]
4584; CHECK-NEXT:    call void @llvm.donothing()
4585; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4586; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
4587; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
4588; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4589; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR7]]
4590; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
4591; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
4592;
4593entry:
4594  %0 = bitcast <4 x i32> %b to <2 x i64>
4595  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
4596  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
4597  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4598  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
4599  ret <2 x i64> %vmull2.i
4600}
4601
4602define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
4603; CHECK-LABEL: define <2 x i64> @foo7a(
4604; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
4605; CHECK-NEXT:  [[ENTRY:.*:]]
4606; CHECK-NEXT:    call void @llvm.donothing()
4607; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4608; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
4609; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
4610; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4611; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR7]]
4612; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
4613; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
4614;
4615entry:
4616  %0 = bitcast <4 x i32> %b to <2 x i64>
4617  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
4618  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
4619  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4620  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
4621  ret <2 x i64> %vmull2.i
4622}
4623
4624
4625define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
4626; CHECK-LABEL: define <4 x i32> @foo8(
4627; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6]] {
4628; CHECK-NEXT:  [[ENTRY:.*:]]
4629; CHECK-NEXT:    call void @llvm.donothing()
4630; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4631; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
4632; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
4633; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4634; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR7]]
4635; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
4636; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
4637;
4638entry:
4639  %0 = bitcast <8 x i16> %b to <2 x i64>
4640  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
4641  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
4642  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4643  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
4644  ret <4 x i32> %vmull2.i
4645}
4646
4647define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
4648; CHECK-LABEL: define <4 x i32> @foo8a(
4649; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR6]] {
4650; CHECK-NEXT:  [[ENTRY:.*:]]
4651; CHECK-NEXT:    call void @llvm.donothing()
4652; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4653; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
4654; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <4 x i16>
4655; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4656; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[SHUFFLE]]) #[[ATTR7]]
4657; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
4658; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
4659;
4660entry:
4661  %0 = bitcast <8 x i16> %b to <2 x i64>
4662  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
4663  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
4664  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4665  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
4666  ret <4 x i32> %vmull2.i
4667}
4668
4669define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
4670; CHECK-LABEL: define <2 x i64> @foo9(
4671; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
4672; CHECK-NEXT:  [[ENTRY:.*:]]
4673; CHECK-NEXT:    call void @llvm.donothing()
4674; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4675; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
4676; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
4677; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4678; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR7]]
4679; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
4680; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
4681;
4682entry:
4683  %0 = bitcast <4 x i32> %b to <2 x i64>
4684  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
4685  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
4686  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4687  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
4688  ret <2 x i64> %vmull2.i
4689}
4690
4691define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
4692; CHECK-LABEL: define <2 x i64> @foo9a(
4693; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR6]] {
4694; CHECK-NEXT:  [[ENTRY:.*:]]
4695; CHECK-NEXT:    call void @llvm.donothing()
4696; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4697; CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
4698; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to <2 x i32>
4699; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4700; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[SHUFFLE]]) #[[ATTR7]]
4701; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
4702; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
4703;
4704entry:
4705  %0 = bitcast <4 x i32> %b to <2 x i64>
4706  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
4707  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
4708  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
4709  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
4710  ret <2 x i64> %vmull2.i
4711}
4712
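; bar0 through bar5 accumulate the widening multiply into %a; the return shadow
; is the multiply's shadow ORed with the accumulator's shadow.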
4713define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind sanitize_memory {
4714; CHECK-LABEL: define <8 x i16> @bar0(
4715; CHECK-SAME: <8 x i16> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
4716; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4717; CHECK-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4718; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
4719; CHECK-NEXT:    call void @llvm.donothing()
4720; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
4721; CHECK-NEXT:    [[TMP:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
4722; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4723; CHECK-NEXT:    [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4724; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
4725; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <8 x i8>
4726; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
4727; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[C]] to <2 x i64>
4728; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4729; CHECK-NEXT:    [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4730; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <8 x i8>
4731; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <8 x i8>
4732; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[TMP5]], [[TMP7]]
4733; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i8> [[_MSPROP2]], zeroinitializer
4734; CHECK-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[_MSPROP3]] to <8 x i16>
4735; CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR7]]
4736; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i16> [[TMP8]], [[TMP11]]
4737; CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[VMULL_I_I_I]], [[A]]
4738; CHECK-NEXT:    store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
4739; CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4740;
4741  %temp = bitcast <16 x i8> %b to <2 x i64>
4742  %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4743  %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
4744  %temp2 = bitcast <16 x i8> %c to <2 x i64>
4745  %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4746  %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
4747  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
4748  %add.i = add <8 x i16> %vmull.i.i.i, %a
4749  ret <8 x i16> %add.i
4750}
4751
4752define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind sanitize_memory {
4753; CHECK-LABEL: define <4 x i32> @bar1(
4754; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
4755; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4756; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4757; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
4758; CHECK-NEXT:    call void @llvm.donothing()
4759; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP9]] to <2 x i64>
4760; CHECK-NEXT:    [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4761; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4762; CHECK-NEXT:    [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4763; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
4764; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <4 x i16>
4765; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP10]] to <2 x i64>
4766; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <2 x i64>
4767; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4768; CHECK-NEXT:    [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4769; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <4 x i16>
4770; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <4 x i16>
4771; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[TMP5]], [[TMP7]]
4772; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP2]], zeroinitializer
4773; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
4774; CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR7]]
4775; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP8]], [[TMP11]]
4776; CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[VMULL2_I_I_I]], [[A]]
4777; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
4778; CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4779;
4780  %temp = bitcast <8 x i16> %b to <2 x i64>
4781  %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4782  %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
4783  %temp2 = bitcast <8 x i16> %c to <2 x i64>
4784  %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4785  %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
4786  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
4787  %add.i = add <4 x i32> %vmull2.i.i.i, %a
4788  ret <4 x i32> %add.i
4789}
4790
4791define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind sanitize_memory {
4792; CHECK-LABEL: define <2 x i64> @bar2(
4793; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
4794; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4795; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4796; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
4797; CHECK-NEXT:    call void @llvm.donothing()
4798; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
4799; CHECK-NEXT:    [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4800; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4801; CHECK-NEXT:    [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4802; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
4803; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <2 x i32>
4804; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP10]] to <2 x i64>
4805; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <2 x i64>
4806; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4807; CHECK-NEXT:    [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4808; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <2 x i32>
4809; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <2 x i32>
4810; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[TMP5]], [[TMP7]]
4811; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP2]], zeroinitializer
4812; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
4813; CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR7]]
4814; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[TMP8]], [[TMP11]]
4815; CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[VMULL2_I_I_I]], [[A]]
4816; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
4817; CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
4818;
4819  %temp = bitcast <4 x i32> %b to <2 x i64>
4820  %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4821  %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
4822  %temp2 = bitcast <4 x i32> %c to <2 x i64>
4823  %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4824  %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
4825  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
4826  %add.i = add <2 x i64> %vmull2.i.i.i, %a
4827  ret <2 x i64> %add.i
4828}
4829
4830define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind sanitize_memory {
4831; CHECK-LABEL: define <8 x i16> @bar3(
4832; CHECK-SAME: <8 x i16> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
4833; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4834; CHECK-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4835; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
4836; CHECK-NEXT:    call void @llvm.donothing()
4837; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
4838; CHECK-NEXT:    [[TMP:%.*]] = bitcast <16 x i8> [[B]] to <2 x i64>
4839; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4840; CHECK-NEXT:    [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4841; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
4842; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <8 x i8>
4843; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
4844; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[C]] to <2 x i64>
4845; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4846; CHECK-NEXT:    [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4847; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <8 x i8>
4848; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <8 x i8>
4849; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i8> [[TMP5]], [[TMP7]]
4850; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i8> [[_MSPROP2]], zeroinitializer
4851; CHECK-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[_MSPROP3]] to <8 x i16>
4852; CHECK-NEXT:    [[VMULL_I_I_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[TMP3]]) #[[ATTR7]]
4853; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i16> [[TMP8]], [[TMP11]]
4854; CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[VMULL_I_I_I]], [[A]]
4855; CHECK-NEXT:    store <8 x i16> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
4856; CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
4857;
4858  %temp = bitcast <16 x i8> %b to <2 x i64>
4859  %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4860  %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
4861  %temp2 = bitcast <16 x i8> %c to <2 x i64>
4862  %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4863  %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
4864  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %temp1, <8 x i8> %temp3) nounwind
4865  %add.i = add <8 x i16> %vmull.i.i.i, %a
4866  ret <8 x i16> %add.i
4867}
4868
4869define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind sanitize_memory {
4870; CHECK-LABEL: define <4 x i32> @bar4(
4871; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
4872; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4873; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4874; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
4875; CHECK-NEXT:    call void @llvm.donothing()
4876; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP9]] to <2 x i64>
4877; CHECK-NEXT:    [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4878; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4879; CHECK-NEXT:    [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4880; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
4881; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <4 x i16>
4882; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP10]] to <2 x i64>
4883; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <2 x i64>
4884; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4885; CHECK-NEXT:    [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4886; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <4 x i16>
4887; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <4 x i16>
4888; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i16> [[TMP5]], [[TMP7]]
4889; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[_MSPROP2]], zeroinitializer
4890; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i16> [[_MSPROP3]] to <4 x i32>
4891; CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR7]]
4892; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP8]], [[TMP11]]
4893; CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[VMULL2_I_I_I]], [[A]]
4894; CHECK-NEXT:    store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
4895; CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
4896;
4897  %temp = bitcast <8 x i16> %b to <2 x i64>
4898  %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4899  %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
4900  %temp2 = bitcast <8 x i16> %c to <2 x i64>
4901  %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4902  %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
4903  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
4904  %add.i = add <4 x i32> %vmull2.i.i.i, %a
4905  ret <4 x i32> %add.i
4906}
4907
4908define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind sanitize_memory {
4909; CHECK-LABEL: define <2 x i64> @bar5(
4910; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
4911; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4912; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4913; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
4914; CHECK-NEXT:    call void @llvm.donothing()
4915; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
4916; CHECK-NEXT:    [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
4917; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4918; CHECK-NEXT:    [[SHUFFLE_I_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4919; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
4920; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I_I]] to <2 x i32>
4921; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP10]] to <2 x i64>
4922; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <2 x i64>
4923; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4924; CHECK-NEXT:    [[SHUFFLE_I3_I_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4925; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <2 x i32>
4926; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I_I]] to <2 x i32>
4927; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i32> [[TMP5]], [[TMP7]]
4928; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[_MSPROP2]], zeroinitializer
4929; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i32> [[_MSPROP3]] to <2 x i64>
4930; CHECK-NEXT:    [[VMULL2_I_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR7]]
4931; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[TMP8]], [[TMP11]]
4932; CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[VMULL2_I_I_I]], [[A]]
4933; CHECK-NEXT:    store <2 x i64> [[_MSPROP4]], ptr @__msan_retval_tls, align 8
4934; CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
4935;
4936  %temp = bitcast <4 x i32> %b to <2 x i64>
4937  %shuffle.i.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4938  %temp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
4939  %temp2 = bitcast <4 x i32> %c to <2 x i64>
4940  %shuffle.i3.i.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4941  %temp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
4942  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
4943  %add.i = add <2 x i64> %vmull2.i.i.i, %a
4944  ret <2 x i64> %add.i
4945}
4946
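; mlal2_1 through mlal2_5 splat a single lane of %c before the high-half
; multiply-accumulate; the lane splat is replayed on the shadow as well.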
4947define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind sanitize_memory {
4948; CHECK-LABEL: define <4 x i32> @mlal2_1(
4949; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
4950; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4951; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4952; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
4953; CHECK-NEXT:    call void @llvm.donothing()
4954; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> splat (i16 -1), <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4955; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4956; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP10]] to <2 x i64>
4957; CHECK-NEXT:    [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
4958; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4959; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
4960; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <4 x i16>
4961; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
4962; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[_MSPROP]] to <2 x i64>
4963; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <2 x i64>
4964; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
4965; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
4966; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP2]] to <4 x i16>
4967; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
4968; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[TMP5]], [[TMP7]]
4969; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i16> [[_MSPROP3]], zeroinitializer
4970; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i16> [[_MSPROP4]] to <4 x i32>
4971; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR7]]
4972; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP8]], [[TMP11]]
4973; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[VMULL2_I_I]], [[A]]
4974; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
4975; CHECK-NEXT:    ret <4 x i32> [[ADD]]
4976;
4977  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4978  %temp = bitcast <8 x i16> %b to <2 x i64>
4979  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
4980  %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
4981  %temp2 = bitcast <8 x i16> %shuffle to <2 x i64>
4982  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
4983  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
4984  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
4985  %add = add <4 x i32> %vmull2.i.i, %a
4986  ret <4 x i32> %add
4987}
4988
4989define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind sanitize_memory {
4990; CHECK-LABEL: define <2 x i64> @mlal2_2(
4991; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
4992; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
4993; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
4994; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
4995; CHECK-NEXT:    call void @llvm.donothing()
4996; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> splat (i32 -1), <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4997; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4998; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP10]] to <2 x i64>
4999; CHECK-NEXT:    [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
5000; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
5001; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
5002; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <2 x i32>
5003; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
5004; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[_MSPROP]] to <2 x i64>
5005; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <2 x i64>
5006; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
5007; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
5008; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP2]] to <2 x i32>
5009; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
5010; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[TMP5]], [[TMP7]]
5011; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i32> [[_MSPROP3]], zeroinitializer
5012; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i32> [[_MSPROP4]] to <2 x i64>
5013; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR7]]
5014; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <2 x i64> [[TMP8]], [[TMP11]]
5015; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[VMULL2_I_I]], [[A]]
5016; CHECK-NEXT:    store <2 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
5017; CHECK-NEXT:    ret <2 x i64> [[ADD]]
5018;
5019  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5020  %temp = bitcast <4 x i32> %b to <2 x i64>
5021  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
5022  %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
5023  %temp2 = bitcast <4 x i32> %shuffle to <2 x i64>
5024  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
5025  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
5026  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
5027  %add = add <2 x i64> %vmull2.i.i, %a
5028  ret <2 x i64> %add
5029}
5030
5031define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind sanitize_memory {
5032; CHECK-LABEL: define <4 x i32> @mlal2_4(
5033; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
5034; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
5035; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
5036; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
5037; CHECK-NEXT:    call void @llvm.donothing()
5038; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> splat (i16 -1), <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5039; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5040; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP10]] to <2 x i64>
5041; CHECK-NEXT:    [[TMP:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
5042; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
5043; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
5044; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <4 x i16>
5045; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
5046; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[_MSPROP]] to <2 x i64>
5047; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <2 x i64>
5048; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
5049; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
5050; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP2]] to <4 x i16>
5051; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <4 x i16>
5052; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i16> [[TMP5]], [[TMP7]]
5053; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i16> [[_MSPROP3]], zeroinitializer
5054; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i16> [[_MSPROP4]] to <4 x i32>
5055; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP3]]) #[[ATTR7]]
5056; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP8]], [[TMP11]]
5057; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[VMULL2_I_I]], [[A]]
5058; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
5059; CHECK-NEXT:    ret <4 x i32> [[ADD]]
5060;
5061  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5062  %temp = bitcast <8 x i16> %b to <2 x i64>
5063  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
5064  %temp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
5065  %temp2 = bitcast <8 x i16> %shuffle to <2 x i64>
5066  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
5067  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
5068  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %temp1, <4 x i16> %temp3) nounwind
5069  %add = add <4 x i32> %vmull2.i.i, %a
5070  ret <4 x i32> %add
5071}
5072
5073define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind sanitize_memory {
5074; CHECK-LABEL: define <2 x i64> @mlal2_5(
5075; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
5076; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
5077; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
5078; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
5079; CHECK-NEXT:    call void @llvm.donothing()
5080; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> splat (i32 -1), <4 x i32> zeroinitializer
5081; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> undef, <4 x i32> zeroinitializer
5082; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP10]] to <2 x i64>
5083; CHECK-NEXT:    [[TMP:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
5084; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
5085; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <1 x i32> <i32 1>
5086; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to <2 x i32>
5087; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
5088; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[_MSPROP]] to <2 x i64>
5089; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <2 x i64>
5090; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> splat (i64 -1), <1 x i32> <i32 1>
5091; CHECK-NEXT:    [[SHUFFLE_I3_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <1 x i32> <i32 1>
5092; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[_MSPROP2]] to <2 x i32>
5093; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[SHUFFLE_I3_I]] to <2 x i32>
5094; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i32> [[TMP5]], [[TMP7]]
5095; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i32> [[_MSPROP3]], zeroinitializer
5096; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i32> [[_MSPROP4]] to <2 x i64>
5097; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[TMP3]]) #[[ATTR7]]
5098; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <2 x i64> [[TMP8]], [[TMP11]]
5099; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[VMULL2_I_I]], [[A]]
5100; CHECK-NEXT:    store <2 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
5101; CHECK-NEXT:    ret <2 x i64> [[ADD]]
5102;
5103  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
5104  %temp = bitcast <4 x i32> %b to <2 x i64>
5105  %shuffle.i.i = shufflevector <2 x i64> %temp, <2 x i64> undef, <1 x i32> <i32 1>
5106  %temp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
5107  %temp2 = bitcast <4 x i32> %shuffle to <2 x i64>
5108  %shuffle.i3.i = shufflevector <2 x i64> %temp2, <2 x i64> undef, <1 x i32> <i32 1>
5109  %temp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
5110  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %temp1, <2 x i32> %temp3) nounwind
5111  %add = add <2 x i64> %vmull2.i.i, %a
5112  ret <2 x i64> %add
5113}
5114
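; The remaining scalar-by-vector multiplies and the vmla_laneq tests are not
; marked sanitize_memory, so only a clean return shadow is stored.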
5115; rdar://12328502
5116define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
5117; CHECK-LABEL: define <2 x double> @vmulq_n_f64(
5118; CHECK-SAME: <2 x double> [[X:%.*]], double [[Y:%.*]]) #[[ATTR3]] {
5119; CHECK-NEXT:  [[ENTRY:.*:]]
5120; CHECK-NEXT:    call void @llvm.donothing()
5121; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[Y]], i32 0
5122; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[Y]], i32 1
5123; CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x double> [[VECINIT1_I]], [[X]]
5124; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5125; CHECK-NEXT:    ret <2 x double> [[MUL_I]]
5126;
5127entry:
5128  %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
5129  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
5130  %mul.i = fmul <2 x double> %vecinit1.i, %x
5131  ret <2 x double> %mul.i
5132}
5133
5134define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
5135; CHECK-LABEL: define <4 x float> @vmulq_n_f32(
5136; CHECK-SAME: <4 x float> [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] {
5137; CHECK-NEXT:  [[ENTRY:.*:]]
5138; CHECK-NEXT:    call void @llvm.donothing()
5139; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Y]], i32 0
5140; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Y]], i32 1
5141; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[Y]], i32 2
5142; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Y]], i32 3
5143; CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[VECINIT3_I]], [[X]]
5144; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5145; CHECK-NEXT:    ret <4 x float> [[MUL_I]]
5146;
5147entry:
5148  %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
5149  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
5150  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
5151  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
5152  %mul.i = fmul <4 x float> %vecinit3.i, %x
5153  ret <4 x float> %mul.i
5154}
5155
5156define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
5157; CHECK-LABEL: define <2 x float> @vmul_n_f32(
5158; CHECK-SAME: <2 x float> [[X:%.*]], float [[Y:%.*]]) #[[ATTR3]] {
5159; CHECK-NEXT:  [[ENTRY:.*:]]
5160; CHECK-NEXT:    call void @llvm.donothing()
5161; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[Y]], i32 0
5162; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[Y]], i32 1
5163; CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[VECINIT1_I]], [[X]]
5164; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5165; CHECK-NEXT:    ret <2 x float> [[MUL_I]]
5166;
5167entry:
5168  %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
5169  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
5170  %mul.i = fmul <2 x float> %vecinit1.i, %x
5171  ret <2 x float> %mul.i
5172}
5173
5174define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
5175; CHECK-LABEL: define <4 x i16> @vmla_laneq_s16_test(
5176; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] {
5177; CHECK-NEXT:  [[ENTRY:.*:]]
5178; CHECK-NEXT:    call void @llvm.donothing()
5179; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
5180; CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i16> [[SHUFFLE]], [[B]]
5181; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i16> [[MUL]], [[A]]
5182; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5183; CHECK-NEXT:    ret <4 x i16> [[ADD]]
5184;
5185entry:
5186  %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
5187  %mul = mul <4 x i16> %shuffle, %b
5188  %add = add <4 x i16> %mul, %a
5189  ret <4 x i16> %add
5190}
5191
5192define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
5193; CHECK-LABEL: define <2 x i32> @vmla_laneq_s32_test(
5194; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] {
5195; CHECK-NEXT:  [[ENTRY:.*:]]
5196; CHECK-NEXT:    call void @llvm.donothing()
5197; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> undef, <2 x i32> <i32 3, i32 3>
5198; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[SHUFFLE]], [[B]]
5199; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i32> [[MUL]], [[A]]
5200; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5201; CHECK-NEXT:    ret <2 x i32> [[ADD]]
5202;
5203entry:
5204  %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
5205  %mul = mul <2 x i32> %shuffle, %b
5206  %add = add <2 x i32> %mul, %a
5207  ret <2 x i32> %add
5208}
5209
5210define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
5211; CHECK-LABEL: define <8 x i16> @not_really_vmlaq_laneq_s16_test(
5212; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]] {
5213; CHECK-NEXT:  [[ENTRY:.*:]]
5214; CHECK-NEXT:    call void @llvm.donothing()
5215; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5216; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[SHUFFLE1]], <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
5217; CHECK-NEXT:    [[MUL:%.*]] = mul <8 x i16> [[SHUFFLE2]], [[B]]
5218; CHECK-NEXT:    [[ADD:%.*]] = add <8 x i16> [[MUL]], [[A]]
5219; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5220; CHECK-NEXT:    ret <8 x i16> [[ADD]]
5221;
5222entry:
5223  %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
5224  %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
5225  %mul = mul <8 x i16> %shuffle2, %b
5226  %add = add <8 x i16> %mul, %a
5227  ret <8 x i16> %add
5228}
5229
5230define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
5231; CHECK-LABEL: define <4 x i32> @not_really_vmlaq_laneq_s32_test(
5232; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]] {
5233; CHECK-NEXT:  [[ENTRY:.*:]]
5234; CHECK-NEXT:    call void @llvm.donothing()
5235; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5236; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[SHUFFLE1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5237; CHECK-NEXT:    [[MUL:%.*]] = mul <4 x i32> [[SHUFFLE2]], [[B]]
5238; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[MUL]], [[A]]
5239; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5240; CHECK-NEXT:    ret <4 x i32> [[ADD]]
5241;
5242entry:
5243  %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5244  %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5245  %mul = mul <4 x i32> %shuffle2, %b
5246  %add = add <4 x i32> %mul, %a
5247  ret <4 x i32> %add
5248}
5249
5250define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
5251; CHECK-LABEL: define <4 x i32> @vmull_laneq_s16_test(
5252; CHECK-SAME: <4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR3]] {
5253; CHECK-NEXT:  [[ENTRY:.*:]]
5254; CHECK-NEXT:    call void @llvm.donothing()
5255; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
5256; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]])
5257; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5258; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
5259;
5260entry:
5261  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
5262  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
5263  ret <4 x i32> %vmull2.i
5264}
5265
5266define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
5267; CHECK-LABEL: define <2 x i64> @vmull_laneq_s32_test(
5268; CHECK-SAME: <2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR3]] {
5269; CHECK-NEXT:  [[ENTRY:.*:]]
5270; CHECK-NEXT:    call void @llvm.donothing()
5271; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 2>
5272; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]])
5273; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5274; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
5275;
5276entry:
5277  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
5278  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
5279  ret <2 x i64> %vmull2.i
5280}

5281define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
5282; CHECK-LABEL: define <4 x i32> @vmull_laneq_u16_test(
5283; CHECK-SAME: <4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR3]] {
5284; CHECK-NEXT:  [[ENTRY:.*:]]
5285; CHECK-NEXT:    call void @llvm.donothing()
5286; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
5287; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]])
5288; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5289; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I]]
5290;
5291entry:
5292  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
5293  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
5294  ret <4 x i32> %vmull2.i
5295}
5296
5297define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
5298; CHECK-LABEL: define <2 x i64> @vmull_laneq_u32_test(
5299; CHECK-SAME: <2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR3]] {
5300; CHECK-NEXT:  [[ENTRY:.*:]]
5301; CHECK-NEXT:    call void @llvm.donothing()
5302; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> undef, <2 x i32> <i32 2, i32 2>
5303; CHECK-NEXT:    [[VMULL2_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]])
5304; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5305; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I]]
5306;
5307entry:
5308  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
5309  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
5310  ret <2 x i64> %vmull2.i
5311}
5312
5313define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
5314; CHECK-LABEL: define <4 x i32> @vmull_low_n_s16_test(
5315; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
5316; CHECK-NEXT:  [[ENTRY:.*:]]
5317; CHECK-NEXT:    call void @llvm.donothing()
5318; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[D]] to i16
5319; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
5320; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> zeroinitializer
5321; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
5322; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
5323; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[CONV]], i32 1
5324; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[CONV]], i32 2
5325; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[CONV]], i32 3
5326; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[VECINIT3_I]]) #[[ATTR7]]
5327; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5328; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
5329;
5330entry:
5331  %conv = trunc i32 %d to i16
5332  %0 = bitcast <8 x i16> %b to <2 x i64>
5333  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 0>
5334  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
5335  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
5336  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
5337  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
5338  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
5339  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
5340  ret <4 x i32> %vmull2.i.i
5341}
5342
5343define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
5344; CHECK-LABEL: define <4 x i32> @vmull_high_n_s16_test(
5345; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
5346; CHECK-NEXT:  [[ENTRY:.*:]]
5347; CHECK-NEXT:    call void @llvm.donothing()
5348; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[D]] to i16
5349; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
5350; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
5351; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
5352; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
5353; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[CONV]], i32 1
5354; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[CONV]], i32 2
5355; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[CONV]], i32 3
5356; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[VECINIT3_I]]) #[[ATTR7]]
5357; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5358; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
5359;
5360entry:
5361  %conv = trunc i32 %d to i16
5362  %0 = bitcast <8 x i16> %b to <2 x i64>
5363  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
5364  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
5365  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
5366  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
5367  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
5368  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
5369  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
5370  ret <4 x i32> %vmull2.i.i
5371}
5372
5373define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
5374; CHECK-LABEL: define <2 x i64> @vmull_high_n_s32_test(
5375; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
5376; CHECK-NEXT:  [[ENTRY:.*:]]
5377; CHECK-NEXT:    call void @llvm.donothing()
5378; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
5379; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
5380; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
5381; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[D]], i32 0
5382; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[D]], i32 1
5383; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[VECINIT1_I]]) #[[ATTR7]]
5384; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5385; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
5386;
5387entry:
5388  %0 = bitcast <4 x i32> %b to <2 x i64>
5389  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
5390  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
5391  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
5392  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
5393  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
5394  ret <2 x i64> %vmull2.i.i
5395}
5396
5397define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
5398; CHECK-LABEL: define <4 x i32> @vmull_high_n_u16_test(
5399; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
5400; CHECK-NEXT:  [[ENTRY:.*:]]
5401; CHECK-NEXT:    call void @llvm.donothing()
5402; CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[D]] to i16
5403; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <2 x i64>
5404; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
5405; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <4 x i16>
5406; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0
5407; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[CONV]], i32 1
5408; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[CONV]], i32 2
5409; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[CONV]], i32 3
5410; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[VECINIT3_I]]) #[[ATTR7]]
5411; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5412; CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
5413;
5414entry:
5415  %conv = trunc i32 %d to i16
5416  %0 = bitcast <8 x i16> %b to <2 x i64>
5417  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
5418  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
5419  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
5420  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
5421  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
5422  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
5423  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
5424  ret <4 x i32> %vmull2.i.i
5425}
5426
5427define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
5428; CHECK-LABEL: define <2 x i64> @vmull_high_n_u32_test(
5429; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], i32 [[D:%.*]]) #[[ATTR6]] {
5430; CHECK-NEXT:  [[ENTRY:.*:]]
5431; CHECK-NEXT:    call void @llvm.donothing()
5432; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <2 x i64>
5433; CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> undef, <1 x i32> <i32 1>
5434; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to <2 x i32>
5435; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 [[D]], i32 0
5436; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[D]], i32 1
5437; CHECK-NEXT:    [[VMULL2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[TMP1]], <2 x i32> [[VECINIT1_I]]) #[[ATTR7]]
5438; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5439; CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
5440;
5441entry:
5442  %0 = bitcast <4 x i32> %b to <2 x i64>
5443  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
5444  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
5445  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
5446  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
5447  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
5448  ret <2 x i64> %vmull2.i.i
5449}
5450
5451define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
5452; CHECK-LABEL: define <4 x i32> @vmul_built_dup_test(
5453; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) {
5454; CHECK-NEXT:    call void @llvm.donothing()
5455; CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i32> [[B]], i32 1
5456; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 [[VGET_LANE]], i32 0
5457; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[VGET_LANE]], i32 1
5458; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[VGET_LANE]], i32 2
5459; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[VGET_LANE]], i32 3
5460; CHECK-NEXT:    [[PROD:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
5461; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5462; CHECK-NEXT:    ret <4 x i32> [[PROD]]
5463;
5464  %vget_lane = extractelement <4 x i32> %b, i32 1
5465  %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
5466  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
5467  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
5468  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
5469  %prod = mul <4 x i32> %a, %vecinit3.i
5470  ret <4 x i32> %prod
5471}
5472
5473define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
5474; CHECK-LABEL: define <4 x i16> @vmul_built_dup_fromsmall_test(
5475; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) {
5476; CHECK-NEXT:    call void @llvm.donothing()
5477; CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 3
5478; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i32 0
5479; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[VGET_LANE]], i32 1
5480; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[VGET_LANE]], i32 2
5481; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[VGET_LANE]], i32 3
5482; CHECK-NEXT:    [[PROD:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
5483; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5484; CHECK-NEXT:    ret <4 x i16> [[PROD]]
5485;
5486  %vget_lane = extractelement <4 x i16> %b, i32 3
5487  %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
5488  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
5489  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
5490  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
5491  %prod = mul <4 x i16> %a, %vecinit3.i
5492  ret <4 x i16> %prod
5493}
5494
5495define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
5496; CHECK-LABEL: define <8 x i16> @vmulq_built_dup_fromsmall_test(
5497; CHECK-SAME: <8 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) {
5498; CHECK-NEXT:    call void @llvm.donothing()
5499; CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 0
5500; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[VGET_LANE]], i32 0
5501; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[VGET_LANE]], i32 1
5502; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[VGET_LANE]], i32 2
5503; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[VGET_LANE]], i32 3
5504; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[VGET_LANE]], i32 4
5505; CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[VGET_LANE]], i32 5
5506; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[VGET_LANE]], i32 6
5507; CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[VGET_LANE]], i32 7
5508; CHECK-NEXT:    [[PROD:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
5509; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5510; CHECK-NEXT:    ret <8 x i16> [[PROD]]
5511;
5512  %vget_lane = extractelement <4 x i16> %b, i32 0
5513  %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
5514  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
5515  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
5516  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
5517  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
5518  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
5519  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
5520  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
5521  %prod = mul <8 x i16> %a, %vecinit7.i
5522  ret <8 x i16> %prod
5523}
5524
5525define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
5526; CHECK-LABEL: define <2 x i64> @mull_from_two_extracts(
5527; CHECK-SAME: <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5528; CHECK-NEXT:    call void @llvm.donothing()
5529; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5530; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5531; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5532; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5533; CHECK-NEXT:    ret <2 x i64> [[RES]]
5534;
5535  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5536  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5537
5538  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5539  ret <2 x i64> %res
5540}
5541
5542define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
5543; CHECK-LABEL: define <2 x i64> @mlal_from_two_extracts(
5544; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5545; CHECK-NEXT:    call void @llvm.donothing()
5546; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5547; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5548; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5549; CHECK-NEXT:    [[SUM:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[ACCUM]], <2 x i64> [[RES]])
5550; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5551; CHECK-NEXT:    ret <2 x i64> [[SUM]]
5552;
5553  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5554  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5555
5556  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5557  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
5558  ret <2 x i64> %sum
5559}
5560
5561define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) {
5562; CHECK-LABEL: define <2 x i64> @mull_from_extract_dup_low(
5563; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]]) {
5564; CHECK-NEXT:    call void @llvm.donothing()
5565; CHECK-NEXT:    [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> undef, i32 [[RHS]], i32 0
5566; CHECK-NEXT:    [[RHSVEC:%.*]] = insertelement <2 x i32> [[RHSVEC_TMP]], i32 [[RHS]], i32 1
5567; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5568; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHSVEC]]) #[[ATTR7]]
5569; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5570; CHECK-NEXT:    ret <2 x i64> [[RES]]
5571;
5572  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
5573  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
5574
5575  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5576
5577  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
5578  ret <2 x i64> %res
5579}
5580
5581define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) {
5582; CHECK-LABEL: define <2 x i64> @mull_from_extract_dup_high(
5583; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]]) {
5584; CHECK-NEXT:    call void @llvm.donothing()
5585; CHECK-NEXT:    [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> undef, i32 [[RHS]], i32 0
5586; CHECK-NEXT:    [[RHSVEC:%.*]] = insertelement <2 x i32> [[RHSVEC_TMP]], i32 [[RHS]], i32 1
5587; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5588; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHSVEC]]) #[[ATTR7]]
5589; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5590; CHECK-NEXT:    ret <2 x i64> [[RES]]
5591;
5592  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
5593  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
5594
5595  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5596
5597  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
5598  ret <2 x i64> %res
5599}
5600
5601define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
5602; CHECK-LABEL: define <8 x i16> @pmull_from_extract_dup_low(
5603; CHECK-SAME: <16 x i8> [[LHS:%.*]], i8 [[RHS:%.*]]) {
5604; CHECK-NEXT:    call void @llvm.donothing()
5605; CHECK-NEXT:    [[RHSVEC_0:%.*]] = insertelement <8 x i8> undef, i8 [[RHS]], i32 0
5606; CHECK-NEXT:    [[RHSVEC:%.*]] = shufflevector <8 x i8> [[RHSVEC_0]], <8 x i8> undef, <8 x i32> zeroinitializer
5607; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5608; CHECK-NEXT:    [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHSVEC]]) #[[ATTR7]]
5609; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5610; CHECK-NEXT:    ret <8 x i16> [[RES]]
5611;
5612  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
5613  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
5614
5615  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5616
5617  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
5618  ret <8 x i16> %res
5619}
5620
5621define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
5622; CHECK-LABEL: define <8 x i16> @pmull_from_extract_dup_high(
5623; CHECK-SAME: <16 x i8> [[LHS:%.*]], i8 [[RHS:%.*]]) {
5624; CHECK-NEXT:    call void @llvm.donothing()
5625; CHECK-NEXT:    [[RHSVEC_0:%.*]] = insertelement <8 x i8> undef, i8 [[RHS]], i32 0
5626; CHECK-NEXT:    [[RHSVEC:%.*]] = shufflevector <8 x i8> [[RHSVEC_0]], <8 x i8> undef, <8 x i32> zeroinitializer
5627; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5628; CHECK-NEXT:    [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHSVEC]]) #[[ATTR7]]
5629; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5630; CHECK-NEXT:    ret <8 x i16> [[RES]]
5631;
5632  %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
5633  %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
5634
5635  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5636
5637  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
5638  ret <8 x i16> %res
5639}
5640
5641define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) {
5642; CHECK-LABEL: define <8 x i16> @pmull_from_extract_duplane_low(
5643; CHECK-SAME: <16 x i8> [[LHS:%.*]], <8 x i8> [[RHS:%.*]]) {
5644; CHECK-NEXT:    call void @llvm.donothing()
5645; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5646; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <8 x i8> [[RHS]], <8 x i8> undef, <8 x i32> zeroinitializer
5647; CHECK-NEXT:    [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHS_HIGH]]) #[[ATTR7]]
5648; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5649; CHECK-NEXT:    ret <8 x i16> [[RES]]
5650;
5651  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5652  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
5653
5654  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
5655  ret <8 x i16> %res
5656}
5657
5658define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
5659; CHECK-LABEL: define <8 x i16> @pmull_from_extract_duplane_high(
5660; CHECK-SAME: <16 x i8> [[LHS:%.*]], <8 x i8> [[RHS:%.*]]) {
5661; CHECK-NEXT:    call void @llvm.donothing()
5662; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <16 x i8> [[LHS]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5663; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <8 x i8> [[RHS]], <8 x i8> undef, <8 x i32> zeroinitializer
5664; CHECK-NEXT:    [[RES:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[LHS_HIGH]], <8 x i8> [[RHS_HIGH]]) #[[ATTR7]]
5665; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
5666; CHECK-NEXT:    ret <8 x i16> [[RES]]
5667;
5668  %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5669  %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
5670
5671  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
5672  ret <8 x i16> %res
5673}
5674
5675define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) {
5676; CHECK-LABEL: define <2 x i64> @sqdmull_from_extract_duplane_low(
5677; CHECK-SAME: <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5678; CHECK-NEXT:    call void @llvm.donothing()
5679; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5680; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
5681; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5682; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5683; CHECK-NEXT:    ret <2 x i64> [[RES]]
5684;
5685  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5686  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
5687
5688  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5689  ret <2 x i64> %res
5690}
5691
5692define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) {
5693; CHECK-LABEL: define <2 x i64> @sqdmull_from_extract_duplane_high(
5694; CHECK-SAME: <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5695; CHECK-NEXT:    call void @llvm.donothing()
5696; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5697; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
5698; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5699; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5700; CHECK-NEXT:    ret <2 x i64> [[RES]]
5701;
5702  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5703  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
5704
5705  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5706  ret <2 x i64> %res
5707}
5708
5709define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
5710; CHECK-LABEL: define <2 x i64> @sqdmlal_from_extract_duplane_low(
5711; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5712; CHECK-NEXT:    call void @llvm.donothing()
5713; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5714; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
5715; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5716; CHECK-NEXT:    [[SUM:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[ACCUM]], <2 x i64> [[RES]])
5717; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5718; CHECK-NEXT:    ret <2 x i64> [[SUM]]
5719;
5720  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5721  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
5722
5723  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5724  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
5725  ret <2 x i64> %sum
5726}
5727
5728define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
5729; CHECK-LABEL: define <2 x i64> @sqdmlal_from_extract_duplane_high(
5730; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5731; CHECK-NEXT:    call void @llvm.donothing()
5732; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5733; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
5734; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5735; CHECK-NEXT:    [[SUM:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[ACCUM]], <2 x i64> [[RES]])
5736; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5737; CHECK-NEXT:    ret <2 x i64> [[SUM]]
5738;
5739  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5740  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
5741
5742  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5743  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
5744  ret <2 x i64> %sum
5745}
5746
5747define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
5748; CHECK-LABEL: define <2 x i64> @umlal_from_extract_duplane_low(
5749; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5750; CHECK-NEXT:    call void @llvm.donothing()
5751; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5752; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
5753; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5754; CHECK-NEXT:    [[SUM:%.*]] = add <2 x i64> [[ACCUM]], [[RES]]
5755; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5756; CHECK-NEXT:    ret <2 x i64> [[SUM]]
5757;
5758  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
5759  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
5760
5761  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5762  %sum = add <2 x i64> %accum, %res
5763  ret <2 x i64> %sum
5764}
5765
5766define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
5767; CHECK-LABEL: define <2 x i64> @umlal_from_extract_duplane_high(
5768; CHECK-SAME: <2 x i64> [[ACCUM:%.*]], <4 x i32> [[LHS:%.*]], <4 x i32> [[RHS:%.*]]) {
5769; CHECK-NEXT:    call void @llvm.donothing()
5770; CHECK-NEXT:    [[LHS_HIGH:%.*]] = shufflevector <4 x i32> [[LHS]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5771; CHECK-NEXT:    [[RHS_HIGH:%.*]] = shufflevector <4 x i32> [[RHS]], <4 x i32> undef, <2 x i32> zeroinitializer
5772; CHECK-NEXT:    [[RES:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[LHS_HIGH]], <2 x i32> [[RHS_HIGH]]) #[[ATTR7]]
5773; CHECK-NEXT:    [[SUM:%.*]] = add <2 x i64> [[ACCUM]], [[RES]]
5774; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5775; CHECK-NEXT:    ret <2 x i64> [[SUM]]
5776;
5777  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
5778  %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
5779
5780  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
5781  %sum = add <2 x i64> %accum, %res
5782  ret <2 x i64> %sum
5783}
5784
5785define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
5786; CHECK-LABEL: define float @scalar_fmla_from_extract_v4f32(
5787; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <4 x float> [[RVEC:%.*]]) {
5788; CHECK-NEXT:    call void @llvm.donothing()
5789; CHECK-NEXT:    [[RHS:%.*]] = extractelement <4 x float> [[RVEC]], i32 3
5790; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
5791; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
5792; CHECK-NEXT:    ret float [[RES]]
5793;
5794  %rhs = extractelement <4 x float> %rvec, i32 3
5795  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
5796  ret float %res
5797}
5798
5799define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
5800; CHECK-LABEL: define float @scalar_fmla_from_extract_v2f32(
5801; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <2 x float> [[RVEC:%.*]]) {
5802; CHECK-NEXT:    call void @llvm.donothing()
5803; CHECK-NEXT:    [[RHS:%.*]] = extractelement <2 x float> [[RVEC]], i32 1
5804; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
5805; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
5806; CHECK-NEXT:    ret float [[RES]]
5807;
5808  %rhs = extractelement <2 x float> %rvec, i32 1
5809  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
5810  ret float %res
5811}
5812
5813define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
5814; CHECK-LABEL: define float @scalar_fmls_from_extract_v4f32(
5815; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <4 x float> [[RVEC:%.*]]) {
5816; CHECK-NEXT:    call void @llvm.donothing()
5817; CHECK-NEXT:    [[RHS_SCAL:%.*]] = extractelement <4 x float> [[RVEC]], i32 3
5818; CHECK-NEXT:    [[RHS:%.*]] = fsub float -0.000000e+00, [[RHS_SCAL]]
5819; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
5820; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
5821; CHECK-NEXT:    ret float [[RES]]
5822;
5823  %rhs.scal = extractelement <4 x float> %rvec, i32 3
5824  %rhs = fsub float -0.0, %rhs.scal
5825  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
5826  ret float %res
5827}
5828
5829define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
5830; CHECK-LABEL: define float @scalar_fmls_from_extract_v2f32(
5831; CHECK-SAME: float [[ACCUM:%.*]], float [[LHS:%.*]], <2 x float> [[RVEC:%.*]]) {
5832; CHECK-NEXT:    call void @llvm.donothing()
5833; CHECK-NEXT:    [[RHS_SCAL:%.*]] = extractelement <2 x float> [[RVEC]], i32 1
5834; CHECK-NEXT:    [[RHS:%.*]] = fsub float -0.000000e+00, [[RHS_SCAL]]
5835; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.fma.f32(float [[LHS]], float [[RHS]], float [[ACCUM]])
5836; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
5837; CHECK-NEXT:    ret float [[RES]]
5838;
5839  %rhs.scal = extractelement <2 x float> %rvec, i32 1
5840  %rhs = fsub float -0.0, %rhs.scal
5841  %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
5842  ret float %res
5843}
5844
5845declare float @llvm.fma.f32(float, float, float)
5846
5847define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
5848; CHECK-LABEL: define double @scalar_fmla_from_extract_v2f64(
5849; CHECK-SAME: double [[ACCUM:%.*]], double [[LHS:%.*]], <2 x double> [[RVEC:%.*]]) {
5850; CHECK-NEXT:    call void @llvm.donothing()
5851; CHECK-NEXT:    [[RHS:%.*]] = extractelement <2 x double> [[RVEC]], i32 1
5852; CHECK-NEXT:    [[RES:%.*]] = call double @llvm.fma.f64(double [[LHS]], double [[RHS]], double [[ACCUM]])
5853; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
5854; CHECK-NEXT:    ret double [[RES]]
5855;
5856  %rhs = extractelement <2 x double> %rvec, i32 1
5857  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
5858  ret double %res
5859}
5860
5861define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
5862; CHECK-LABEL: define double @scalar_fmls_from_extract_v2f64(
5863; CHECK-SAME: double [[ACCUM:%.*]], double [[LHS:%.*]], <2 x double> [[RVEC:%.*]]) {
5864; CHECK-NEXT:    call void @llvm.donothing()
5865; CHECK-NEXT:    [[RHS_SCAL:%.*]] = extractelement <2 x double> [[RVEC]], i32 1
5866; CHECK-NEXT:    [[RHS:%.*]] = fsub double -0.000000e+00, [[RHS_SCAL]]
5867; CHECK-NEXT:    [[RES:%.*]] = call double @llvm.fma.f64(double [[LHS]], double [[RHS]], double [[ACCUM]])
5868; CHECK-NEXT:    store i64 0, ptr @__msan_retval_tls, align 8
5869; CHECK-NEXT:    ret double [[RES]]
5870;
5871  %rhs.scal = extractelement <2 x double> %rvec, i32 1
5872  %rhs = fsub double -0.0, %rhs.scal
5873  %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
5874  ret double %res
5875}
5876
5877declare double @llvm.fma.f64(double, double, double)
5878
5879define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
5880; CHECK-LABEL: define <2 x float> @fmls_with_fneg_before_extract_v2f32(
5881; CHECK-SAME: <2 x float> [[ACCUM:%.*]], <2 x float> [[LHS:%.*]], <4 x float> [[RHS:%.*]]) {
5882; CHECK-NEXT:    call void @llvm.donothing()
5883; CHECK-NEXT:    [[RHS_NEG:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[RHS]]
5884; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[RHS_NEG]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
5885; CHECK-NEXT:    [[RES:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[SPLAT]], <2 x float> [[ACCUM]])
5886; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5887; CHECK-NEXT:    ret <2 x float> [[RES]]
5888;
5889  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
5890  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
5891  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
5892  ret <2 x float> %res
5893}
5894
5895define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
5896; CHECK-LABEL: define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(
5897; CHECK-SAME: <2 x float> [[ACCUM:%.*]], <2 x float> [[LHS:%.*]], <2 x float> [[RHS:%.*]]) {
5898; CHECK-NEXT:    call void @llvm.donothing()
5899; CHECK-NEXT:    [[RHS_NEG:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[RHS]]
5900; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x float> [[RHS_NEG]], <2 x float> undef, <2 x i32> <i32 1, i32 1>
5901; CHECK-NEXT:    [[RES:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[SPLAT]], <2 x float> [[ACCUM]])
5902; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5903; CHECK-NEXT:    ret <2 x float> [[RES]]
5904;
5905  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
5906  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
5907  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
5908  ret <2 x float> %res
5909}
5910
5911define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
5912; CHECK-LABEL: define <4 x float> @fmls_with_fneg_before_extract_v4f32(
5913; CHECK-SAME: <4 x float> [[ACCUM:%.*]], <4 x float> [[LHS:%.*]], <4 x float> [[RHS:%.*]]) {
5914; CHECK-NEXT:    call void @llvm.donothing()
5915; CHECK-NEXT:    [[RHS_NEG:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[RHS]]
5916; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x float> [[RHS_NEG]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5917; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[SPLAT]], <4 x float> [[ACCUM]])
5918; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5919; CHECK-NEXT:    ret <4 x float> [[RES]]
5920;
5921  %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
5922  %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5923  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
5924  ret <4 x float> %res
5925}
5926
5927define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
5928; CHECK-LABEL: define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(
5929; CHECK-SAME: <4 x float> [[ACCUM:%.*]], <4 x float> [[LHS:%.*]], <2 x float> [[RHS:%.*]]) {
5930; CHECK-NEXT:    call void @llvm.donothing()
5931; CHECK-NEXT:    [[RHS_NEG:%.*]] = fsub <2 x float> splat (float -0.000000e+00), [[RHS]]
5932; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x float> [[RHS_NEG]], <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5933; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[SPLAT]], <4 x float> [[ACCUM]])
5934; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
5935; CHECK-NEXT:    ret <4 x float> [[RES]]
5936;
5937  %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
5938  %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
5939  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
5940  ret <4 x float> %res
5941}
5942
5943define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
5944; CHECK-LABEL: define <2 x double> @fmls_with_fneg_before_extract_v2f64(
5945; CHECK-SAME: <2 x double> [[ACCUM:%.*]], <2 x double> [[LHS:%.*]], <2 x double> [[RHS:%.*]]) {
5946; CHECK-NEXT:    call void @llvm.donothing()
5947; CHECK-NEXT:    [[RHS_NEG:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[RHS]]
5948; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <2 x double> [[RHS_NEG]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
5949; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LHS]], <2 x double> [[SPLAT]], <2 x double> [[ACCUM]])
5950; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
5951; CHECK-NEXT:    ret <2 x double> [[RES]]
5952;
5953  %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
5954  %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
5955  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
5956  ret <2 x double> %res
5957}
5958
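; The remaining tests are marked sanitize_memory. For the element-wise fmul and
; fdiv below, MSan loads the parameter shadows from @__msan_param_tls, ORs them
; together, and stores the result as the return-value shadow.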
5959define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind sanitize_memory {
5960; CHECK-LABEL: define <1 x double> @test_fmul_v1f64(
5961; CHECK-SAME: <1 x double> [[L:%.*]], <1 x double> [[R:%.*]]) #[[ATTR0]] {
5962; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
5963; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
5964; CHECK-NEXT:    call void @llvm.donothing()
5965; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
5966; CHECK-NEXT:    [[PROD:%.*]] = fmul <1 x double> [[L]], [[R]]
5967; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
5968; CHECK-NEXT:    ret <1 x double> [[PROD]]
5969;
5970  %prod = fmul <1 x double> %L, %R
5971  ret <1 x double> %prod
5972}
5973
5974define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind sanitize_memory {
5975; CHECK-LABEL: define <1 x double> @test_fdiv_v1f64(
5976; CHECK-SAME: <1 x double> [[L:%.*]], <1 x double> [[R:%.*]]) #[[ATTR0]] {
5977; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
5978; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
5979; CHECK-NEXT:    call void @llvm.donothing()
5980; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
5981; CHECK-NEXT:    [[PROD:%.*]] = fdiv <1 x double> [[L]], [[R]]
5982; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
5983; CHECK-NEXT:    ret <1 x double> [[PROD]]
5984;
5985  %prod = fdiv <1 x double> %L, %R
5986  ret <1 x double> %prod
5987}
5988
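; Scalar saturating multiply-add: the i16 operands are first inserted into
; <4 x i16> vectors, so their shadows are bitcast to i64 and checked; any set
; bit branches to __msan_warning_noreturn before the sqdmull intrinsic runs,
; and the return-value shadow is derived solely from the shadow of %C.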
define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqdmlal_s(
; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP6]], i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[A]], i64 0
; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP7]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i64 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP10]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i32 [[TMP8]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[C]], i32 [[TMP4]])
; CHECK-NEXT:    store i32 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[TMP5]]
;
  %temp1 = insertelement <4 x i16> undef, i16 %A, i64 0
  %temp2 = insertelement <4 x i16> undef, i16 %B, i64 0
  %temp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  %temp4 = extractelement <4 x i32> %temp3, i64 0
  %temp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %temp4)
  ret i32 %temp5
}

define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind sanitize_memory {
; CHECK-LABEL: define i64 @sqdmlal_d(
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
; CHECK:       [[BB4]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB5]]:
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]])
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP3]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[C]], i64 [[TMP4]])
; CHECK-NEXT:    store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i64 [[TMP5]]
;
  %temp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %temp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %temp4)
  ret i64 %temp5
}

define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind sanitize_memory {
; CHECK-LABEL: define i32 @sqdmlsl_s(
; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP6]], i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[A]], i64 0
; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP7]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[B]], i64 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[_MSPROP]] to i64
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[_MSPROP1]] to i64
; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i64 [[TMP10]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP3]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
; CHECK:       [[BB6]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB7]]:
; CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i32 [[TMP8]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[C]], i32 [[TMP4]])
; CHECK-NEXT:    store i32 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[TMP5]]
;
  %temp1 = insertelement <4 x i16> undef, i16 %A, i64 0
  %temp2 = insertelement <4 x i16> undef, i16 %B, i64 0
  %temp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %temp1, <4 x i16> %temp2)
  %temp4 = extractelement <4 x i32> %temp3, i64 0
  %temp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %temp4)
  ret i32 %temp5
}

define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind sanitize_memory {
; CHECK-LABEL: define i64 @sqdmlsl_d(
; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
; CHECK:       [[BB4]]:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
; CHECK-NEXT:    unreachable
; CHECK:       [[BB5]]:
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]])
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP3]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[C]], i64 [[TMP4]])
; CHECK-NEXT:    store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i64 [[TMP5]]
;
  %temp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %temp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %temp4)
  ret i64 %temp5
}

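; The pmull64 tests below check that both i64 operand shadows are OR-ed
; together, zero-extended to i128, and bitcast to the <16 x i8> result shadow.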
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind sanitize_memory {
; CHECK-LABEL: define <16 x i8> @test_pmull_64(
; CHECK-SAME: i64 [[L:%.*]], i64 [[R:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = zext i64 [[_MSPROP1]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i128 [[TMP3]] to <16 x i8>
; CHECK-NEXT:    [[VAL:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[L]], i64 [[R]])
; CHECK-NEXT:    store <16 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i8> [[VAL]]
;
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %val
}

define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind sanitize_memory {
; CHECK-LABEL: define <16 x i8> @test_pmull_high_64(
; CHECK-SAME: <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; CHECK-NEXT:    [[L_HI:%.*]] = extractelement <2 x i64> [[L]], i32 1
; CHECK-NEXT:    [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
; CHECK-NEXT:    [[R_HI:%.*]] = extractelement <2 x i64> [[R]], i32 1
; CHECK-NEXT:    [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
; CHECK-NEXT:    [[_MSPROP3:%.*]] = or i64 [[_MSPROP2]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = zext i64 [[_MSPROP3]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i128 [[TMP3]] to <16 x i8>
; CHECK-NEXT:    [[VAL:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[L_HI]], i64 [[R_HI]])
; CHECK-NEXT:    store <16 x i8> [[TMP4]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i8> [[VAL]]
;
  %l_hi = extractelement <2 x i64> %l, i32 1
  %r_hi = extractelement <2 x i64> %r, i32 1
  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
  ret <16 x i8> %val
}

declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)

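; Plain integer mul on <1 x i64>: the checks below verify that the result
; shadow is the bitwise OR of the operand shadows, with no extra checks emitted.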
define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind sanitize_memory {
; CHECK-LABEL: define <1 x i64> @test_mul_v1i64(
; CHECK-SAME: <1 x i64> [[LHS:%.*]], <1 x i64> [[RHS:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[PROD:%.*]] = mul <1 x i64> [[LHS]], [[RHS]]
; CHECK-NEXT:    store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <1 x i64> [[PROD]]
;
  %prod = mul <1 x i64> %lhs, %rhs
  ret <1 x i64> %prod
}
;.
; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
;.
