; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two adjacent constant i8 stores with 2-byte base alignment merge into a
; single 16-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  store i8 456, ptr addrspace(1) %out, align 2
  ret void
}

; With only natural (1-byte) alignment on the base, the two byte stores are
; expected to stay separate.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  store i8 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 0, ptr addrspace(1) %out.gep.1
  store i16 0, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  store float 1.0, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  store i32 123, ptr addrspace(1) %out.gep.1
  store float 4.0, ptr addrspace(1) %out
  ret void
}

; Four adjacent constant i32 stores merge into one dwordx4 store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 333, ptr addrspace(1) %out.gep.3
  store i32 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 8.0, ptr addrspace(1) %out
  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3


  store i32 11, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store i32 17, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
  %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
  %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out.gep.2
  store i64 333, ptr addrspace(1) %out.gep.3
  store i64 1234, ptr addrspace(1) %out
  ret void
}

; Loaded values stored to adjacent locations merge on both the load and the
; store side.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %lo = load i32, ptr addrspace(1) %in.gep.0
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out.gep.0
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %hi, ptr addrspace(1) %out
  store i32 %lo, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword
; CI-DAG: buffer_load_dwordx3
; GCN: s_waitcnt
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3

  %x = load float, ptr addrspace(1) %in
  %y = load float, ptr addrspace(1) %in.gep.1
  %z = load float, ptr addrspace(1) %in.gep.2
  %w = load float, ptr addrspace(1) %in.gep.3

  store float %x, ptr addrspace(1) %out
  store float %y, ptr addrspace(1) %out.gep.1
  store float %z, ptr addrspace(1) %out.gep.2
  store float %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10

  %x = load i32, ptr addrspace(1) %in.gep.0
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out.gep.0
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out.gep.3
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %x, ptr addrspace(1) %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out
  store i32 %z, ptr addrspace(1) %out.gep.1
  store i32 %y, ptr addrspace(1) %out.gep.2
  store i32 %x, ptr addrspace(1) %out.gep.3

  ret void
}

; Four byte loads/stores with a dword-aligned base merge into a single dword
; load and store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in, align 4
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out, align 4
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %vec = load <4 x i32>, ptr addrspace(1) %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; Same merging tests, but through LDS (addrspace 3) using ds_write
; instructions instead of buffer stores.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1

  store i8 123, ptr addrspace(3) %out.gep.1
  store i8 456, ptr addrspace(3) %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out.gep.2
  store i32 333, ptr addrspace(3) %out.gep.3
  store i32 1234, ptr addrspace(3) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
  store i32 9, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 12, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 16, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 -12, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 11, ptr addrspace(1) %idx4, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
  store i32 13, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 15, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 62, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 63, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 11, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 123, ptr addrspace(1) %idx5, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx2
; CI: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
  store i32 34, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 999, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 65, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 33, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 98, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 91, ptr addrspace(1) %idx5, align 4
  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
  store i32 212, ptr addrspace(1) %idx6, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
  store i32 34, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 999, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 65, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 33, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 98, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 91, ptr addrspace(1) %idx5, align 4
  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
  store i32 212, ptr addrspace(1) %idx6, align 4
  %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
  store i32 999, ptr addrspace(1) %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
  store <3 x i32> %vec, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
  store <3 x i64> %vec, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x float>, ptr addrspace(1) %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x double>, ptr addrspace(1) %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, ptr addrspace(1) %out
  ret void
}

; Workgroup barrier intrinsic used by the *_inverse and *_shuffle kernels above.
declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }