; AMDGPU/irreducible/reducible-headers.ll

; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s

;
;                         Entry
;                           |
;                           v
;                  -------->H---------
;                  |        |        |
;                  |        v        |
;                  |    --->T----    |
;                  |    |       |    |
;                  |    |       V    |
;                  S<---R       P <---
;                  ^    ^       |
;                  |    |  Div  |
;                  |    --- Q <--
;                  |        |
;                  |        v
;                  -------- U
;                           |
;                           v
;                          Exit
;
; The divergent branch is at Q that exits an irreducible cycle with
; entries T and P nested inside a reducible cycle with header H. R is
; assigned label R, which reaches P. S is a join node with label S. If
; this is propagated to P via H, then P is incorrectly recognized as a
; join, making the inner cycle divergent. P is always executed
; convergently -- either by threads that reconverged at header H, or
; by threads that are still executing the inner cycle. Thus, any PHI
; at P should not be marked divergent.

define amdgpu_kernel void @nested_irreducible(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: UniformityInfo for function 'nested_irreducible':
; NOTE(review): the "ASSSUMED" spelling below is kept verbatim; it presumably
; mirrors the text printed by the analysis -- confirm before changing it.
; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
; CHECK: CYCLES WITH DIVERGENT EXIT:
; CHECK-DAG:   depth=2: entries(P T) R Q
; CHECK-DAG:   depth=1: entries(H) S P T R Q U
entry:
  ; %cond.uni depends only on the kernel argument %a, while %cond.div is
  ; computed from the workitem id and drives the only divergent branch (in Q).
  %cond.uni = icmp slt i32 %a, 0
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cond.div = icmp slt i32 %tid, 0
  br label %H

H:
  ; Header of the outer cycle; branches on %cond.uni into either entry
  ; (T or P) of the inner cycle.
  br i1 %cond.uni, label %T, label %P

P:
  ; Neither the PHI nor the plain add in P is expected to be reported as
  ; divergent.
; CHECK-LABEL: BLOCK P
; CHECK-NOT:   DIVERGENT:   %pp.phi =
; CHECK-NOT: DIVERGENT:      %pp =
  %pp.phi  = phi i32 [ %a, %H], [ %b, %T ]
  %pp = add i32 %b, 1
  br label %Q

Q:
  ; Values defined here are not expected to be divergent inside the cycle,
  ; even though the terminator of Q branches on %cond.div.
; CHECK-LABEL: BLOCK Q
; CHECK-NOT: DIVERGENT:   %qq =
; CHECK-NOT:   DIVERGENT:   %qq.uni =
  %qq = add i32 %b, 1
  %qq.uni = add i32 %pp.phi, 1
  br i1 %cond.div, label %R, label %U

R:
  ; Branches on %cond.uni: either to S (and from there back to H) or to T.
  br i1 %cond.uni, label %S, label %T

T:
  ; Second entry of the inner cycle; its PHI is not expected to be divergent.
; CHECK-LABEL: BLOCK T
; CHECK-NOT:   DIVERGENT:   %tt.phi =
; CHECK-NOT: DIVERGENT:     %tt =
  %tt.phi = phi i32 [ %qq, %R ], [ %a, %H ]
  %tt = add i32 %b, 1
  br label %P

S:
  ; S merges the two successors of the divergent branch in Q (via R and via U),
  ; so the PHI is expected to be divergent while the plain add is not.
; CHECK-LABEL: BLOCK S
; CHECK:   DIVERGENT:   %ss.phi =
; CHECK-NOT: DIVERGENT:     %ss =
  %ss.phi = phi i32 [ %qq.uni, %U ], [ %a, %R ]
  %ss = add i32 %b, 1
  br label %H

U:
  ; Branches on %cond.uni: back into the outer cycle via S, or out to exit.
  br i1 %cond.uni, label %S, label %exit

exit:
  ; %ee.div uses %qq.uni, which is defined inside the cycles listed above as
  ; having a divergent exit, so it is expected to be divergent out here.
; CHECK-LABEL: BLOCK exit
; CHECK: DIVERGENT:     %ee.div =
; CHECK-NOT: DIVERGENT:     %ee =
  %ee.div =  add i32 %qq.uni, 1
  %ee = add i32 %b, 1
  ret void
}

;
;                         Entry
;                           |
;                           v
;               -->-------->H---------
;               |  ^        |        |
;               |  |        |        |
;               |  |        |        |
;               |  |        |        |
;               |  |        v        V
;               |  R<-------T-->U--->P
;               |          Div       |
;               |                    |
;               ----------- Q <-------
;                           |
;                           v
;                          Exit
;
; This is a reducible cycle with a divergent branch at T. Disjoint
; paths eventually join at the header H, which is assigned label H.
; Node P is assigned label U. If the header label were propagated to
; P, it would be incorrectly recognized as a join. P is always executed
; convergently -- either by threads that reconverged at header H, or
; by threads that diverged at T (and eventually reconverged at H).
; Thus, any PHI at P should not be marked divergent.

define amdgpu_kernel void @header_label_1(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: UniformityInfo for function 'header_label_1':
; NOTE(review): the "ASSSUMED" spelling below is kept verbatim; it presumably
; mirrors the text printed by the analysis -- confirm before changing it.
; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
; CHECK: CYCLES WITH DIVERGENT EXIT:
; CHECK:  depth=1: entries(H) Q P U T R
entry:
  ; %cond.uni depends only on the kernel argument %a, while %cond.div is
  ; computed from the workitem id and drives the only divergent branch (in T).
  %cond.uni = icmp slt i32 %a, 0
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cond.div = icmp slt i32 %tid, 0
  br label %H

H:
  ; Header of the cycle; branches on %cond.uni to T or directly to P.
  br i1 %cond.uni, label %T, label %P

P:
  ; P is reached from H and from U; neither the PHI nor the plain add is
  ; expected to be reported as divergent.
; CHECK-LABEL: BLOCK P
; CHECK-NOT:   DIVERGENT:   %pp.phi =
; CHECK-NOT: DIVERGENT:      %pp =
  %pp.phi  = phi i32 [ %a, %H], [ %b, %U ]
  %pp = add i32 %b, 1
  br label %Q

Q:
  ; Values defined here are not expected to be divergent inside the cycle.
  ; The terminator branches on %cond.uni: leave to exit or go back to H.
; CHECK-LABEL: BLOCK Q
; CHECK-NOT: DIVERGENT:   %qq =
; CHECK-NOT:   DIVERGENT:   %qq.uni =
  %qq = add i32 %b, 1
  %qq.uni = add i32 %pp.phi, 1
  br i1 %cond.uni, label %exit, label %H

R:
  ; One successor of the divergent branch in T; goes straight back to H.
  br label %H

T:
  ; The divergent branch: R and U are selected by %cond.div.
  br i1 %cond.div, label %R, label %U

U:
  ; The other successor of the divergent branch in T; continues to P.
  br label %P

exit:
  ; %ee.div uses %qq.uni, which is defined inside the cycle listed above as
  ; having a divergent exit, so it is expected to be divergent out here.
; CHECK-LABEL: BLOCK exit
; CHECK: DIVERGENT:     %ee.div =
; CHECK-NOT: DIVERGENT:     %ee =
  %ee.div =  add i32 %qq.uni, 1
  %ee = add i32 %b, 1
  ret void
}

;        entry
;            |
;        --> H1
;        |   | \
;        |   | H2(div)
;        |   \ / \
;        |    B   C
;        ^     \ /
;        \------D
;               |
;               X
;
; This is a reducible cycle with a divergent branch at H2. Disjoint
; paths eventually join at the header D, which is assigned label D.
; Node B is assigned label B. If the header label D were propagated to
; B, it would be incorrectly recognized as a join. B is always executed
; convergently -- either by threads that reconverged at header H1, or
; by threads that diverge at H2 (and eventually reconverged at H1).
; Thus, any PHI at B should not be marked divergent.

define amdgpu_kernel void @header_label_2(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: UniformityInfo for function 'header_label_2':
; NOTE(review): the "ASSSUMED" spelling below is kept verbatim; it presumably
; mirrors the text printed by the analysis -- confirm before changing it.
; CHECK-NOT: CYCLES ASSSUMED DIVERGENT:
; CHECK-NOT: CYCLES WITH DIVERGENT EXIT:
entry:
  ; %cond.uni depends only on the kernel argument %a, while %cond.div is
  ; computed from the workitem id and drives the only divergent branch (in H2).
  %cond.uni = icmp slt i32 %a, 0
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cond.div = icmp slt i32 %tid, 0
  br label %H1

H1:
  ; Target of the back edge from D; branches on %cond.uni to B or H2.
  br i1 %cond.uni, label %B, label %H2

H2:
  ; The divergent branch: B and C are selected by %cond.div.
  br i1 %cond.div, label %B, label %C

B:
  ; B is reached from H1 and from H2; its PHI is not expected to be reported
  ; as divergent.
; CHECK-LABEL: BLOCK B
; CHECK-NOT: DIVERGENT:     %bb.phi =
  %bb.phi = phi i32 [%a, %H1], [%b, %H2]
  br label %D

C:
  ; The other successor of the divergent branch in H2; continues to D.
  br label %D

D:
  ; D merges B and C, the two successors of the divergent branch in H2, so
  ; its PHI is expected to be divergent. The terminator branches on
  ; %cond.uni: leave to exit or go back to H1.
; CHECK-LABEL: BLOCK D
; CHECK: DIVERGENT:     %dd.phi =
  %dd.phi = phi i32 [%a, %B], [%b, %C]
  br i1 %cond.uni, label %exit, label %H1

exit:
  ; No expectations are stated for the values defined in this block.
  %ee.1 = add i32 %dd.phi, 1
  %ee.2 = add i32 %b, 1
  ret void
}

; Intrinsic called in the entry block of every test function above to obtain
; %tid, from which %cond.div is computed.
; NOTE(review): attribute group #0 is not defined in the visible part of this
; file -- confirm that a matching "attributes #0" line exists.
declare i32 @llvm.amdgcn.workitem.id.x() #0