1; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck %s 2; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions 3 4target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" 5 6; TODO: Vector element tests 7; TODO: Non-zero base offset for load and store combinations 8; TODO: Same base addrspacecasted 9 10 11define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 { 12; CHECK-LABEL: @merge_global_store_2_constants_i8( 13; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 2 14; CHECK-NEXT: ret void 15; 16 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1 17 18 store i8 123, ptr addrspace(1) %out.gep.1 19 store i8 456, ptr addrspace(1) %out, align 2 20 ret void 21} 22 23define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 { 24; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align( 25; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 1 26; CHECK-NEXT: ret void 27; 28 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1 29 30 store i8 123, ptr addrspace(1) %out.gep.1 31 store i8 456, ptr addrspace(1) %out 32 ret void 33} 34 35define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 { 36; CHECK-LABEL: @merge_global_store_2_constants_i16( 37; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 4 38; CHECK-NEXT: ret void 39; 40 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 41 42 store i16 123, ptr addrspace(1) %out.gep.1 43 store i16 456, ptr addrspace(1) %out, align 4 44 ret void 45} 46 47define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 { 48; CHECK-LABEL: @merge_global_store_2_constants_0_i16( 49; CHECK-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(1) [[OUT:%.*]], align 4 50; CHECK-NEXT: ret void 51; 52 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 53 54 store i16 0, ptr addrspace(1) %out.gep.1 55 store i16 0, ptr addrspace(1) %out, align 4 56 ret void 57} 58 59define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 { 60; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align( 61; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 2 62; CHECK-NEXT: ret void 63; 64 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 65 66 store i16 123, ptr addrspace(1) %out.gep.1 67 store i16 456, ptr addrspace(1) %out 68 ret void 69} 70 71define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(ptr addrspace(1) %out) #0 { 72; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1( 73; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 1 74; CHECK-NEXT: ret void 75; 76 %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1 77 78 store i16 123, ptr addrspace(1) %out.gep.1, align 1 79 store i16 456, ptr addrspace(1) %out, align 1 80 ret void 81} 82 83define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(ptr addrspace(1) %out) #0 { 84; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align( 85; CHECK-NEXT: store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 2 86; CHECK-NEXT: ret void 87; 88 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1 89 90 store half 2.0, ptr addrspace(1) %out.gep.1 91 store half 1.0, ptr addrspace(1) %out 92 ret void 93} 94 95define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(ptr addrspace(1) %out) #0 { 96; CHECK-LABEL: @merge_global_store_2_constants_half_align_1( 97; CHECK-NEXT: store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 1 98; CHECK-NEXT: ret void 99; 100 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1 101 102 store half 2.0, ptr addrspace(1) %out.gep.1, align 1 103 store half 1.0, ptr addrspace(1) %out, align 1 104 ret void 105} 106 107define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 { 108; CHECK-LABEL: @merge_global_store_2_constants_i32( 109; CHECK-NEXT: store <2 x i32> <i32 456, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4 110; CHECK-NEXT: ret void 111; 112 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 113 114 store i32 123, ptr addrspace(1) %out.gep.1 115 store i32 456, ptr addrspace(1) %out 116 ret void 117} 118 119define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 { 120; CHECK-LABEL: @merge_global_store_2_constants_i32_f32( 121; CHECK-NEXT: store <2 x i32> <i32 456, i32 1065353216>, ptr addrspace(1) [[OUT:%.*]], align 4 122; CHECK-NEXT: ret void 123; 124 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 125 store float 1.0, ptr addrspace(1) %out.gep.1 126 store i32 456, ptr addrspace(1) %out 127 ret void 128} 129 130define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 { 131; CHECK-LABEL: @merge_global_store_2_constants_f32_i32( 132; CHECK-NEXT: store <2 x i32> <i32 1082130432, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4 133; CHECK-NEXT: ret void 134; 135 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 136 store i32 123, ptr addrspace(1) %out.gep.1 137 store float 4.0, ptr addrspace(1) %out 138 ret void 139} 140 141define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 { 142; CHECK-LABEL: @merge_global_store_4_constants_i32( 143; CHECK-NEXT: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, ptr addrspace(1) [[OUT:%.*]], align 4 144; CHECK-NEXT: ret void 145; 146 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 147 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 148 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 149 150 store i32 123, ptr addrspace(1) %out.gep.1 151 store i32 456, ptr addrspace(1) %out.gep.2 152 store i32 333, ptr addrspace(1) %out.gep.3 153 store i32 1234, ptr addrspace(1) %out 154 ret void 155} 156 157define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 { 158; CHECK-LABEL: @merge_global_store_4_constants_f32_order( 159; CHECK-NEXT: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4 160; CHECK-NEXT: ret void 161; 162 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 163 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 164 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 165 166 store float 8.0, ptr addrspace(1) %out 167 store float 1.0, ptr addrspace(1) %out.gep.1 168 store float 2.0, ptr addrspace(1) %out.gep.2 169 store float 4.0, ptr addrspace(1) %out.gep.3 170 ret void 171} 172 173; First store is out of order. 174define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 { 175; CHECK-LABEL: @merge_global_store_4_constants_f32( 176; CHECK-NEXT: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4 177; CHECK-NEXT: ret void 178; 179 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 180 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 181 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 182 183 store float 1.0, ptr addrspace(1) %out.gep.1 184 store float 2.0, ptr addrspace(1) %out.gep.2 185 store float 4.0, ptr addrspace(1) %out.gep.3 186 store float 8.0, ptr addrspace(1) %out 187 ret void 188} 189 190define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 { 191; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32( 192; CHECK-NEXT: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, ptr addrspace(1) [[OUT:%.*]], align 4 193; CHECK-NEXT: ret void 194; 195 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 196 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 197 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 198 199 200 store i32 11, ptr addrspace(1) %out.gep.1 201 store float 2.0, ptr addrspace(1) %out.gep.2 202 store i32 17, ptr addrspace(1) %out.gep.3 203 store float 8.0, ptr addrspace(1) %out 204 ret void 205} 206 207define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 { 208; CHECK-LABEL: @merge_global_store_3_constants_i32( 209; CHECK-NEXT: store <3 x i32> <i32 1234, i32 123, i32 456>, ptr addrspace(1) [[OUT:%.*]], align 4 210; CHECK-NEXT: ret void 211; 212 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 213 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 214 215 store i32 123, ptr addrspace(1) %out.gep.1 216 store i32 456, ptr addrspace(1) %out.gep.2 217 store i32 1234, ptr addrspace(1) %out 218 ret void 219} 220 221define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 { 222; CHECK-LABEL: @merge_global_store_2_constants_i64( 223; CHECK-NEXT: store <2 x i64> <i64 456, i64 123>, ptr addrspace(1) [[OUT:%.*]], align 8 224; CHECK-NEXT: ret void 225; 226 %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1 227 228 store i64 123, ptr addrspace(1) %out.gep.1 229 store i64 456, ptr addrspace(1) %out 230 ret void 231} 232 233define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 { 234; CHECK-LABEL: @merge_global_store_4_constants_i64( 235; CHECK-NEXT: [[OUT_GEP_2:%.*]] = getelementptr i64, ptr addrspace(1) [[OUT:%.*]], i64 2 236; CHECK-NEXT: store <2 x i64> <i64 456, i64 333>, ptr addrspace(1) [[OUT_GEP_2]], align 8 237; CHECK-NEXT: store <2 x i64> <i64 1234, i64 123>, ptr addrspace(1) [[OUT]], align 8 238; CHECK-NEXT: ret void 239; 240 %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1 241 %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2 242 %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3 243 244 store i64 123, ptr addrspace(1) %out.gep.1 245 store i64 456, ptr addrspace(1) %out.gep.2 246 store i64 333, ptr addrspace(1) %out.gep.3 247 store i64 1234, ptr addrspace(1) %out 248 ret void 249} 250 251define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 252; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32( 253; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 254; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 255; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 256; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0 257; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1 258; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4 259; CHECK-NEXT: ret void 260; 261 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 262 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 263 264 %lo = load i32, ptr addrspace(1) %in 265 %hi = load i32, ptr addrspace(1) %in.gep.1 266 267 store i32 %lo, ptr addrspace(1) %out 268 store i32 %hi, ptr addrspace(1) %out.gep.1 269 ret void 270} 271 272define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 273; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base( 274; CHECK-NEXT: [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 2 275; CHECK-NEXT: [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 2 276; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4 277; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 278; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 279; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0 280; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1 281; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT_GEP_0]], align 4 282; CHECK-NEXT: ret void 283; 284 %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2 285 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3 286 287 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2 288 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3 289 %lo = load i32, ptr addrspace(1) %in.gep.0 290 %hi = load i32, ptr addrspace(1) %in.gep.1 291 292 store i32 %lo, ptr addrspace(1) %out.gep.0 293 store i32 %hi, ptr addrspace(1) %out.gep.1 294 ret void 295} 296 297define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 298; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32( 299; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 300; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 301; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 302; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[HI2]], i32 0 303; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LO1]], i32 1 304; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4 305; CHECK-NEXT: ret void 306; 307 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 308 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 309 310 %lo = load i32, ptr addrspace(1) %in 311 %hi = load i32, ptr addrspace(1) %in.gep.1 312 313 store i32 %hi, ptr addrspace(1) %out 314 store i32 %lo, ptr addrspace(1) %out.gep.1 315 ret void 316} 317 318define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 319; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32( 320; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 321; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 322; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 323; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 324; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 325; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0 326; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1 327; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2 328; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3 329; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4 330; CHECK-NEXT: ret void 331; 332 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 333 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 334 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 335 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 336 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2 337 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3 338 339 %x = load i32, ptr addrspace(1) %in 340 %y = load i32, ptr addrspace(1) %in.gep.1 341 %z = load i32, ptr addrspace(1) %in.gep.2 342 %w = load i32, ptr addrspace(1) %in.gep.3 343 344 store i32 %x, ptr addrspace(1) %out 345 store i32 %y, ptr addrspace(1) %out.gep.1 346 store i32 %z, ptr addrspace(1) %out.gep.2 347 store i32 %w, ptr addrspace(1) %out.gep.3 348 ret void 349} 350 351define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 352; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32( 353; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 354; CHECK-NEXT: [[X1:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0 355; CHECK-NEXT: [[Y2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 1 356; CHECK-NEXT: [[Z3:%.*]] = extractelement <3 x i32> [[TMP1]], i32 2 357; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[X1]], i32 0 358; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[Y2]], i32 1 359; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[Z3]], i32 2 360; CHECK-NEXT: store <3 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4 361; CHECK-NEXT: ret void 362; 363 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 364 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 365 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 366 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2 367 368 %x = load i32, ptr addrspace(1) %in 369 %y = load i32, ptr addrspace(1) %in.gep.1 370 %z = load i32, ptr addrspace(1) %in.gep.2 371 372 store i32 %x, ptr addrspace(1) %out 373 store i32 %y, ptr addrspace(1) %out.gep.1 374 store i32 %z, ptr addrspace(1) %out.gep.2 375 ret void 376} 377 378define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 379; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32( 380; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[IN:%.*]], align 4 381; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 382; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 383; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 384; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 385; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[X1]], i32 0 386; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[Y2]], i32 1 387; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Z3]], i32 2 388; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[W4]], i32 3 389; CHECK-NEXT: store <4 x float> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4 390; CHECK-NEXT: ret void 391; 392 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1 393 %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2 394 %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3 395 %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1 396 %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2 397 %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3 398 399 %x = load float, ptr addrspace(1) %in 400 %y = load float, ptr addrspace(1) %in.gep.1 401 %z = load float, ptr addrspace(1) %in.gep.2 402 %w = load float, ptr addrspace(1) %in.gep.3 403 404 store float %x, ptr addrspace(1) %out 405 store float %y, ptr addrspace(1) %out.gep.1 406 store float %z, ptr addrspace(1) %out.gep.2 407 store float %w, ptr addrspace(1) %out.gep.3 408 ret void 409} 410 411define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 412; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base( 413; CHECK-NEXT: [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 11 414; CHECK-NEXT: [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 7 415; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4 416; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 417; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 418; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 419; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 420; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0 421; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1 422; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2 423; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3 424; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT_GEP_0]], align 4 425; CHECK-NEXT: ret void 426; 427 %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11 428 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12 429 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13 430 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14 431 %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7 432 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8 433 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9 434 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10 435 436 %x = load i32, ptr addrspace(1) %in.gep.0 437 %y = load i32, ptr addrspace(1) %in.gep.1 438 %z = load i32, ptr addrspace(1) %in.gep.2 439 %w = load i32, ptr addrspace(1) %in.gep.3 440 441 store i32 %x, ptr addrspace(1) %out.gep.0 442 store i32 %y, ptr addrspace(1) %out.gep.1 443 store i32 %z, ptr addrspace(1) %out.gep.2 444 store i32 %w, ptr addrspace(1) %out.gep.3 445 ret void 446} 447 448define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 449; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32( 450; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 451; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 452; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 453; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 454; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 455; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() #[[ATTR3:[0-9]+]] 456; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0 457; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1 458; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2 459; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3 460; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4 461; CHECK-NEXT: ret void 462; 463 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 464 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 465 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 466 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 467 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2 468 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3 469 470 %x = load i32, ptr addrspace(1) %in 471 %y = load i32, ptr addrspace(1) %in.gep.1 472 %z = load i32, ptr addrspace(1) %in.gep.2 473 %w = load i32, ptr addrspace(1) %in.gep.3 474 475 ; Make sure the barrier doesn't stop this 476 tail call void @llvm.amdgcn.s.barrier() #1 477 478 store i32 %w, ptr addrspace(1) %out.gep.3 479 store i32 %z, ptr addrspace(1) %out.gep.2 480 store i32 %y, ptr addrspace(1) %out.gep.1 481 store i32 %x, ptr addrspace(1) %out 482 483 ret void 484} 485 486define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 487; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32( 488; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 489; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 490; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 491; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 492; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 493; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() #[[ATTR3]] 494; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[W4]], i32 0 495; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z3]], i32 1 496; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 2 497; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X1]], i32 3 498; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4 499; CHECK-NEXT: ret void 500; 501 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 502 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 503 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 504 %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1 505 %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2 506 %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3 507 508 %x = load i32, ptr addrspace(1) %in 509 %y = load i32, ptr addrspace(1) %in.gep.1 510 %z = load i32, ptr addrspace(1) %in.gep.2 511 %w = load i32, ptr addrspace(1) %in.gep.3 512 513 ; Make sure the barrier doesn't stop this 514 tail call void @llvm.amdgcn.s.barrier() #1 515 516 store i32 %w, ptr addrspace(1) %out 517 store i32 %z, ptr addrspace(1) %out.gep.1 518 store i32 %y, ptr addrspace(1) %out.gep.2 519 store i32 %x, ptr addrspace(1) %out.gep.3 520 521 ret void 522} 523 524define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 525; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8( 526; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 4 527; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0 528; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 529; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 530; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3 531; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0 532; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1 533; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2 534; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3 535; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4 536; CHECK-NEXT: ret void 537; 538 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1 539 %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2 540 %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3 541 %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1 542 %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2 543 %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3 544 545 %x = load i8, ptr addrspace(1) %in, align 4 546 %y = load i8, ptr addrspace(1) %in.gep.1 547 %z = load i8, ptr addrspace(1) %in.gep.2 548 %w = load i8, ptr addrspace(1) %in.gep.3 549 550 store i8 %x, ptr addrspace(1) %out, align 4 551 store i8 %y, ptr addrspace(1) %out.gep.1 552 store i8 %z, ptr addrspace(1) %out.gep.2 553 store i8 %w, ptr addrspace(1) %out.gep.3 554 ret void 555} 556 557define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 558; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align( 559; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 1 560; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0 561; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 562; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 563; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3 564; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0 565; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1 566; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2 567; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3 568; CHECK-NEXT: store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 1 569; CHECK-NEXT: ret void 570; 571 %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1 572 %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2 573 %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3 574 %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1 575 %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2 576 %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3 577 578 %x = load i8, ptr addrspace(1) %in 579 %y = load i8, ptr addrspace(1) %in.gep.1 580 %z = load i8, ptr addrspace(1) %in.gep.2 581 %w = load i8, ptr addrspace(1) %in.gep.3 582 583 store i8 %x, ptr addrspace(1) %out 584 store i8 %y, ptr addrspace(1) %out.gep.1 585 store i8 %z, ptr addrspace(1) %out.gep.2 586 store i8 %w, ptr addrspace(1) %out.gep.3 587 ret void 588} 589 590define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 591; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32( 592; CHECK-NEXT: [[VEC:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 16 593; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0 594; CHECK-NEXT: [[Y:%.*]] = extractelement <4 x i32> [[VEC]], i32 1 595; CHECK-NEXT: [[Z:%.*]] = extractelement <4 x i32> [[VEC]], i32 2 596; CHECK-NEXT: [[W:%.*]] = extractelement <4 x i32> [[VEC]], i32 3 597; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 598; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Y]], i32 1 599; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z]], i32 2 600; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[W]], i32 3 601; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4 602; CHECK-NEXT: ret void 603; 604 %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1 605 %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2 606 %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3 607 %vec = load <4 x i32>, ptr addrspace(1) %in 608 609 %x = extractelement <4 x i32> %vec, i32 0 610 %y = extractelement <4 x i32> %vec, i32 1 611 %z = extractelement <4 x i32> %vec, i32 2 612 %w = extractelement <4 x i32> %vec, i32 3 613 614 store i32 %x, ptr addrspace(1) %out 615 store i32 %y, ptr addrspace(1) %out.gep.1 616 store i32 %z, ptr addrspace(1) %out.gep.2 617 store i32 %w, ptr addrspace(1) %out.gep.3 618 ret void 619} 620 621define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 { 622; CHECK-LABEL: @merge_local_store_2_constants_i8( 623; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, ptr addrspace(3) [[OUT:%.*]], align 2 624; CHECK-NEXT: ret void 625; 626 %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1 627 628 store i8 123, ptr addrspace(3) %out.gep.1 629 store i8 456, ptr addrspace(3) %out, align 2 630 ret void 631} 632 633define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 { 634; CHECK-LABEL: @merge_local_store_2_constants_i32( 635; CHECK-NEXT: store <2 x i32> <i32 456, i32 123>, ptr addrspace(3) [[OUT:%.*]], align 4 636; CHECK-NEXT: ret void 637; 638 %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1 639 640 store i32 123, ptr addrspace(3) %out.gep.1 641 store i32 456, ptr addrspace(3) %out 642 ret void 643} 644 645define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(ptr addrspace(3) %out) #0 { 646; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2( 647; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 1 648; CHECK-NEXT: store i32 123, ptr addrspace(3) [[OUT_GEP_1]], align 2 649; CHECK-NEXT: store i32 456, ptr addrspace(3) [[OUT]], align 2 650; CHECK-NEXT: ret void 651; 652 %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1 653 654 store i32 123, ptr addrspace(3) %out.gep.1, align 2 655 store i32 456, ptr addrspace(3) %out, align 2 656 ret void 657} 658 659define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 { 660; CHECK-LABEL: @merge_local_store_4_constants_i32( 661; CHECK-NEXT: [[OUT_GEP_2:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 2 662; CHECK-NEXT: store <2 x i32> <i32 456, i32 333>, ptr addrspace(3) [[OUT_GEP_2]], align 4 663; CHECK-NEXT: store <2 x i32> <i32 1234, i32 123>, ptr addrspace(3) [[OUT]], align 4 664; CHECK-NEXT: ret void 665; 666 %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1 667 %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2 668 %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3 669 670 store i32 123, ptr addrspace(3) %out.gep.1 671 store i32 456, ptr addrspace(3) %out.gep.2 672 store i32 333, ptr addrspace(3) %out.gep.3 673 store i32 1234, ptr addrspace(3) %out 674 ret void 675} 676 677define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) { 678; CHECK-LABEL: @merge_global_store_5_constants_i32( 679; CHECK-NEXT: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, ptr addrspace(1) [[OUT:%.*]], align 4 680; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4 681; CHECK-NEXT: store i32 11, ptr addrspace(1) [[IDX4]], align 4 682; CHECK-NEXT: ret void 683; 684 store i32 9, ptr addrspace(1) %out, align 4 685 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 686 store i32 12, ptr addrspace(1) %idx1, align 4 687 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2 688 store i32 16, ptr addrspace(1) %idx2, align 4 689 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3 690 store i32 -12, ptr addrspace(1) %idx3, align 4 691 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4 692 store i32 11, ptr addrspace(1) %idx4, align 4 693 ret void 694} 695 696define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) { 697; CHECK-LABEL: @merge_global_store_6_constants_i32( 698; CHECK-NEXT: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, ptr addrspace(1) [[OUT:%.*]], align 4 699; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4 700; CHECK-NEXT: store <2 x i32> <i32 11, i32 123>, ptr addrspace(1) [[IDX4]], align 4 701; CHECK-NEXT: ret void 702; 703 store i32 13, ptr addrspace(1) %out, align 4 704 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 705 store i32 15, ptr addrspace(1) %idx1, align 4 706 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2 707 store i32 62, ptr addrspace(1) %idx2, align 4 708 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3 709 store i32 63, ptr addrspace(1) %idx3, align 4 710 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4 711 store i32 11, ptr addrspace(1) %idx4, align 4 712 %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5 713 store i32 123, ptr addrspace(1) %idx5, align 4 714 ret void 715} 716 717define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) { 718; CHECK-LABEL: @merge_global_store_7_constants_i32( 719; CHECK-NEXT: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4 720; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4 721; CHECK-NEXT: store <3 x i32> <i32 98, i32 91, i32 212>, ptr addrspace(1) [[IDX4]], align 4 722; CHECK-NEXT: ret void 723; 724 store i32 34, ptr addrspace(1) %out, align 4 725 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 726 store i32 999, ptr addrspace(1) %idx1, align 4 727 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2 728 store i32 65, ptr addrspace(1) %idx2, align 4 729 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3 730 store i32 33, ptr addrspace(1) %idx3, align 4 731 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4 732 store i32 98, ptr addrspace(1) %idx4, align 4 733 %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5 734 store i32 91, ptr addrspace(1) %idx5, align 4 735 %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6 736 store i32 212, ptr addrspace(1) %idx6, align 4 737 ret void 738} 739 740define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) { 741; CHECK-LABEL: @merge_global_store_8_constants_i32( 742; CHECK-NEXT: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4 743; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4 744; CHECK-NEXT: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, ptr addrspace(1) [[IDX4]], align 4 745; CHECK-NEXT: ret void 746; 747 store i32 34, ptr addrspace(1) %out, align 4 748 %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1 749 store i32 999, ptr addrspace(1) %idx1, align 4 750 %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2 751 store i32 65, ptr addrspace(1) %idx2, align 4 752 %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3 753 store i32 33, ptr addrspace(1) %idx3, align 4 754 %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4 755 store i32 98, ptr addrspace(1) %idx4, align 4 756 %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5 757 store i32 91, ptr addrspace(1) %idx5, align 4 758 %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6 759 store i32 212, ptr addrspace(1) %idx6, align 4 760 %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7 761 store i32 999, ptr addrspace(1) %idx7, align 4 762 ret void 763} 764 765define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { 766; CHECK-LABEL: @copy_v3i32_align4( 767; CHECK-NEXT: [[VEC:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4 768; CHECK-NEXT: store <3 x i32> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 16 769; CHECK-NEXT: ret void 770; 771 %vec = load <3 x i32>, ptr addrspace(1) %in, align 4 772 store <3 x i32> %vec, ptr addrspace(1) %out 773 ret void 774} 775 776define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { 777; CHECK-LABEL: @copy_v3i64_align4( 778; CHECK-NEXT: [[VEC:%.*]] = load <3 x i64>, ptr addrspace(1) [[IN:%.*]], align 4 779; CHECK-NEXT: store <3 x i64> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 32 780; CHECK-NEXT: ret void 781; 782 %vec = load <3 x i64>, ptr addrspace(1) %in, align 4 783 store <3 x i64> %vec, ptr addrspace(1) %out 784 ret void 785} 786 787define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { 788; CHECK-LABEL: @copy_v3f32_align4( 789; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr addrspace(1) [[IN:%.*]], align 4 790; CHECK-NEXT: [[FADD:%.*]] = fadd <3 x float> [[VEC]], <float 1.000000e+00, float 2.000000e+00, float 4.000000e+00> 791; CHECK-NEXT: store <3 x float> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 16 792; CHECK-NEXT: ret void 793; 794 %vec = load <3 x float>, ptr addrspace(1) %in, align 4 795 %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0> 796 store <3 x float> %fadd, ptr addrspace(1) %out 797 ret void 798} 799 800define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { 801; CHECK-LABEL: @copy_v3f64_align4( 802; CHECK-NEXT: [[VEC:%.*]] = load <3 x double>, ptr addrspace(1) [[IN:%.*]], align 4 803; CHECK-NEXT: [[FADD:%.*]] = fadd <3 x double> [[VEC]], <double 1.000000e+00, double 2.000000e+00, double 4.000000e+00> 804; CHECK-NEXT: store <3 x double> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 32 805; CHECK-NEXT: ret void 806; 807 %vec = load <3 x double>, ptr addrspace(1) %in, align 4 808 %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0> 809 store <3 x double> %fadd, ptr addrspace(1) %out 810 ret void 811} 812 813; Verify that we no longer hit asserts for this test case. No change expected. 814define amdgpu_kernel void @copy_vec_of_ptrs(ptr addrspace(1) %out, 815; CHECK-LABEL: @copy_vec_of_ptrs( 816; CHECK-NEXT: [[IN_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[IN:%.*]], i32 1 817; CHECK-NEXT: [[VEC1:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN_GEP_1]], align 16 818; CHECK-NEXT: [[VEC2:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN]], align 4 819; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[OUT:%.*]], i32 1 820; CHECK-NEXT: store <2 x ptr> [[VEC1]], ptr addrspace(1) [[OUT_GEP_1]], align 16 821; CHECK-NEXT: store <2 x ptr> [[VEC2]], ptr addrspace(1) [[OUT]], align 4 822; CHECK-NEXT: ret void 823; 824 ptr addrspace(1) %in ) #0 { 825 %in.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %in, i32 1 826 %vec1 = load <2 x ptr>, ptr addrspace(1) %in.gep.1 827 %vec2 = load <2 x ptr>, ptr addrspace(1) %in, align 4 828 829 %out.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %out, i32 1 830 store <2 x ptr> %vec1, ptr addrspace(1) %out.gep.1 831 store <2 x ptr> %vec2, ptr addrspace(1) %out, align 4 832 ret void 833} 834 835declare void @llvm.amdgcn.s.barrier() #1 836 837attributes #0 = { nounwind } 838attributes #1 = { convergent nounwind } 839