xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/jit/docs/intro/tutorial04.rst (revision 8feb0f0b7eaff0608f8350bbfa3098827b4bb91b)
1*8feb0f0bSmrg.. Copyright (C) 2014-2020 Free Software Foundation, Inc.
236ac495dSmrg   Originally contributed by David Malcolm <dmalcolm@redhat.com>
336ac495dSmrg
436ac495dSmrg   This is free software: you can redistribute it and/or modify it
536ac495dSmrg   under the terms of the GNU General Public License as published by
636ac495dSmrg   the Free Software Foundation, either version 3 of the License, or
736ac495dSmrg   (at your option) any later version.
836ac495dSmrg
936ac495dSmrg   This program is distributed in the hope that it will be useful, but
1036ac495dSmrg   WITHOUT ANY WARRANTY; without even the implied warranty of
1136ac495dSmrg   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
1236ac495dSmrg   General Public License for more details.
1336ac495dSmrg
1436ac495dSmrg   You should have received a copy of the GNU General Public License
1536ac495dSmrg   along with this program.  If not, see
1636ac495dSmrg   <http://www.gnu.org/licenses/>.
1736ac495dSmrg
1836ac495dSmrgTutorial part 4: Adding JIT-compilation to a toy interpreter
1936ac495dSmrg------------------------------------------------------------
2036ac495dSmrgIn this example we construct a "toy" interpreter, and add JIT-compilation
2136ac495dSmrgto it.
2236ac495dSmrg
2336ac495dSmrgOur toy interpreter
2436ac495dSmrg*******************
2536ac495dSmrg
2636ac495dSmrgIt's a stack-based interpreter, and is intended as a (very simple) example
2736ac495dSmrgof the kind of bytecode interpreter seen in dynamic languages such as
2836ac495dSmrgPython, Ruby etc.
2936ac495dSmrg
3036ac495dSmrgFor the sake of simplicity, our toy virtual machine is very limited:
3136ac495dSmrg
3236ac495dSmrg  * The only data type is `int`
3336ac495dSmrg
3436ac495dSmrg  * It can only work on one function at a time (so that the only
3536ac495dSmrg    function call that can be made is to recurse).
3636ac495dSmrg
3736ac495dSmrg  * Functions can only take one parameter.
3836ac495dSmrg
3936ac495dSmrg  * Functions have a stack of `int` values.
4036ac495dSmrg
4136ac495dSmrg  * We'll implement function call within the interpreter by calling a
4236ac495dSmrg    function in our implementation, rather than implementing our own
4336ac495dSmrg    frame stack.
4436ac495dSmrg
4536ac495dSmrg  * The parser is only good enough to get the examples to work.
4636ac495dSmrg
4736ac495dSmrgNaturally, a real interpreter would be much more complicated that this.
4836ac495dSmrg
4936ac495dSmrgThe following operations are supported:
5036ac495dSmrg
5136ac495dSmrg====================== ======================== =============== ==============
5236ac495dSmrgOperation              Meaning                  Old Stack       New Stack
5336ac495dSmrg====================== ======================== =============== ==============
5436ac495dSmrgDUP                    Duplicate top of stack.  ``[..., x]``    ``[..., x, x]``
5536ac495dSmrgROT                    Swap top two elements    ``[..., x, y]`` ``[..., y, x]``
5636ac495dSmrg                       of stack.
5736ac495dSmrgBINARY_ADD             Add the top two elements ``[..., x, y]`` ``[..., (x+y)]``
5836ac495dSmrg                       on the stack.
5936ac495dSmrgBINARY_SUBTRACT        Likewise, but subtract.  ``[..., x, y]`` ``[..., (x-y)]``
6036ac495dSmrgBINARY_MULT            Likewise, but multiply.  ``[..., x, y]`` ``[..., (x*y)]``
6136ac495dSmrgBINARY_COMPARE_LT      Compare the top two      ``[..., x, y]`` ``[..., (x<y)]``
6236ac495dSmrg                       elements on the stack
6336ac495dSmrg                       and push a nonzero/zero
6436ac495dSmrg                       if (x<y).
6536ac495dSmrgRECURSE                Recurse, passing the top ``[..., x]``    ``[..., fn(x)]``
6636ac495dSmrg                       of the stack, and
6736ac495dSmrg                       popping the result.
6836ac495dSmrgRETURN                 Return the top of the    ``[x]``         ``[]``
6936ac495dSmrg                       stack.
7036ac495dSmrgPUSH_CONST `arg`       Push an int const.       ``[...]``       ``[..., arg]``
7136ac495dSmrgJUMP_ABS_IF_TRUE `arg` Pop; if top of stack was ``[..., x]``    ``[...]``
7236ac495dSmrg                       nonzero, jump to
7336ac495dSmrg                       ``arg``.
7436ac495dSmrg====================== ======================== =============== ==============
7536ac495dSmrg
7636ac495dSmrgPrograms can be interpreted, disassembled, and compiled to machine code.
7736ac495dSmrg
7836ac495dSmrgThe interpreter reads ``.toy`` scripts.  Here's what a simple recursive
7936ac495dSmrgfactorial program looks like, the script ``factorial.toy``.
8036ac495dSmrgThe parser ignores lines beginning with a `#`.
8136ac495dSmrg
8236ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/factorial.toy
8336ac495dSmrg    :lines: 1-
8436ac495dSmrg    :language: c
8536ac495dSmrg
8636ac495dSmrgThe interpreter is a simple infinite loop with a big ``switch`` statement
8736ac495dSmrgbased on what the next opcode is:
8836ac495dSmrg
8936ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
9036ac495dSmrg    :start-after: /* Execute the given function.  */
9136ac495dSmrg    :end-before: /* JIT compilation.  */
9236ac495dSmrg    :language: c
9336ac495dSmrg
9436ac495dSmrgCompiling to machine code
9536ac495dSmrg*************************
9636ac495dSmrgWe want to generate machine code that can be cast to this type and
9736ac495dSmrgthen directly executed in-process:
9836ac495dSmrg
9936ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
10036ac495dSmrg    :start-after: /* Functions are compiled to this function ptr type.  */
10136ac495dSmrg    :end-before: enum opcode
10236ac495dSmrg    :language: c
10336ac495dSmrg
10436ac495dSmrgThe lifetime of the code is tied to that of a :c:type:`gcc_jit_result *`.
10536ac495dSmrgWe'll handle this by bundling them up in a structure, so that we can
10636ac495dSmrgclean them up together by calling :c:func:`gcc_jit_result_release`:
10736ac495dSmrg
10836ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
10936ac495dSmrg    :start-after: /* A struct to hold the compilation results.  */
11036ac495dSmrg    :end-before: /* The main compilation hook.  */
11136ac495dSmrg    :language: c
11236ac495dSmrg
11336ac495dSmrgOur compiler isn't very sophisticated; it takes the implementation of
11436ac495dSmrgeach opcode above, and maps it directly to the operations supported by
11536ac495dSmrgthe libgccjit API.
11636ac495dSmrg
11736ac495dSmrgHow should we handle the stack?  In theory we could calculate what the
11836ac495dSmrgstack depth will be at each opcode, and optimize away the stack
11936ac495dSmrgmanipulation "by hand".  We'll see below that libgccjit is able to do
12036ac495dSmrgthis for us, so we'll implement stack manipulation
12136ac495dSmrgin a direct way, by creating a ``stack`` array and ``stack_depth``
12236ac495dSmrgvariables, local within the generated function, equivalent to this C code:
12336ac495dSmrg
12436ac495dSmrg.. code-block:: c
12536ac495dSmrg
12636ac495dSmrg  int stack_depth;
12736ac495dSmrg  int stack[MAX_STACK_DEPTH];
12836ac495dSmrg
12936ac495dSmrgWe'll also have local variables ``x`` and ``y`` for use when implementing
13036ac495dSmrgthe opcodes, equivalent to this:
13136ac495dSmrg
13236ac495dSmrg.. code-block:: c
13336ac495dSmrg
13436ac495dSmrg  int x;
13536ac495dSmrg  int y;
13636ac495dSmrg
13736ac495dSmrgThis means our compiler has the following state:
13836ac495dSmrg
13936ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
14036ac495dSmrg    :start-after: /* JIT compilation.  */
14136ac495dSmrg    :end-before: /* Stack manipulation.  */
14236ac495dSmrg    :language: c
14336ac495dSmrg
14436ac495dSmrgSetting things up
14536ac495dSmrg*****************
14636ac495dSmrg
14736ac495dSmrgFirst we create our types:
14836ac495dSmrg
14936ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
15036ac495dSmrg    :start-after: /* Create types.  */
15136ac495dSmrg    :end-before: /* The constant value 1.  */
15236ac495dSmrg    :language: c
15336ac495dSmrg
15436ac495dSmrgalong with extracting a useful `int` constant:
15536ac495dSmrg
15636ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
15736ac495dSmrg    :start-after: /* The constant value 1.  */
15836ac495dSmrg    :end-before: /* Create locations.  */
15936ac495dSmrg    :language: c
16036ac495dSmrg
16136ac495dSmrgWe'll implement push and pop in terms of the ``stack`` array and
16236ac495dSmrg``stack_depth``.  Here are helper functions for adding statements to
16336ac495dSmrga block, implementing pushing and popping values:
16436ac495dSmrg
16536ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
16636ac495dSmrg    :start-after: /* Stack manipulation.  */
16736ac495dSmrg    :end-before: /* A struct to hold the compilation results.  */
16836ac495dSmrg    :language: c
16936ac495dSmrg
17036ac495dSmrgWe will support single-stepping through the generated code in the
17136ac495dSmrgdebugger, so we need to create :c:type:`gcc_jit_location` instances, one
17236ac495dSmrgper operation in the source code.  These will reference the lines of
17336ac495dSmrge.g. ``factorial.toy``.
17436ac495dSmrg
17536ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
17636ac495dSmrg    :start-after: /* Create locations.  */
17736ac495dSmrg    :end-before: /* Creating the function.  */
17836ac495dSmrg    :language: c
17936ac495dSmrg
18036ac495dSmrgLet's create the function itself.  As usual, we create its parameter
18136ac495dSmrgfirst, then use the parameter to create the function:
18236ac495dSmrg
18336ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
18436ac495dSmrg    :start-after: /* Creating the function.  */
18536ac495dSmrg    :end-before: /* Create stack lvalues.  */
18636ac495dSmrg    :language: c
18736ac495dSmrg
18836ac495dSmrgWe create the locals within the function.
18936ac495dSmrg
19036ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
19136ac495dSmrg    :start-after: /* Create stack lvalues.  */
19236ac495dSmrg    :end-before: /* 1st pass: create blocks, one per opcode.
19336ac495dSmrg    :language: c
19436ac495dSmrg
19536ac495dSmrgPopulating the function
19636ac495dSmrg***********************
19736ac495dSmrg
19836ac495dSmrgThere's some one-time initialization, and the API treats the first block
19936ac495dSmrgyou create as the entrypoint of the function, so we need to create that
20036ac495dSmrgblock first:
20136ac495dSmrg
20236ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
20336ac495dSmrg    :start-after: first.  */
20436ac495dSmrg    :end-before: /* Create a block per operation.  */
20536ac495dSmrg    :language: c
20636ac495dSmrg
20736ac495dSmrgWe can now create blocks for each of the operations.  Most of these will
20836ac495dSmrgbe consolidated into larger blocks when the optimizer runs.
20936ac495dSmrg
21036ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
21136ac495dSmrg    :start-after: /* Create a block per operation.  */
21236ac495dSmrg    :end-before: /* Populate the initial block.  */
21336ac495dSmrg    :language: c
21436ac495dSmrg
21536ac495dSmrgNow that we have a block it can jump to when it's done, we can populate
21636ac495dSmrgthe initial block:
21736ac495dSmrg
21836ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
21936ac495dSmrg    :start-after: /* Populate the initial block.  */
22036ac495dSmrg    :end-before: /* 2nd pass: fill in instructions.  */
22136ac495dSmrg    :language: c
22236ac495dSmrg
22336ac495dSmrgWe can now populate the blocks for the individual operations.  We loop
22436ac495dSmrgthrough them, adding instructions to their blocks:
22536ac495dSmrg
22636ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
22736ac495dSmrg    :start-after: /* 2nd pass: fill in instructions.  */
22836ac495dSmrg    :end-before: /* Helper macros.  */
22936ac495dSmrg    :language: c
23036ac495dSmrg
23136ac495dSmrgWe're going to have another big ``switch`` statement for implementing
23236ac495dSmrgthe opcodes, this time for compiling them, rather than interpreting
23336ac495dSmrgthem.  It's helpful to have macros for implementing push and pop, so that
23436ac495dSmrgwe can make the ``switch`` statement that's coming up look as much as
23536ac495dSmrgpossible like the one above within the interpreter:
23636ac495dSmrg
23736ac495dSmrg.. literalinclude:: ../examples/tut04-toyvm/toyvm.c
23836ac495dSmrg    :start-after: /* Helper macros.  */
23936ac495dSmrg    :end-before: gcc_jit_block_add_comment
24036ac495dSmrg    :language: c
24136ac495dSmrg
24236ac495dSmrg.. note::
24336ac495dSmrg
24436ac495dSmrg   A particularly clever implementation would have an *identical*
24536ac495dSmrg   ``switch`` statement shared by the interpreter and the compiler, with
24636ac495dSmrg   some preprocessor "magic".  We're not doing that here, for the sake
24736ac495dSmrg   of simplicity.
24836ac495dSmrg
24936ac495dSmrgWhen I first implemented this compiler, I accidentally missed an edit
25036ac495dSmrgwhen copying and pasting the ``Y_EQUALS_POP`` macro, so that popping the
25136ac495dSmrgstack into ``y`` instead erroneously assigned it to ``x``, leaving ``y``
25236ac495dSmrguninitialized.
25336ac495dSmrg
25436ac495dSmrgTo track this kind of thing down, we can use
25536ac495dSmrg:c:func:`gcc_jit_block_add_comment` to add descriptive comments
25636ac495dSmrgto the internal representation.  This is invaluable when looking through
25736ac495dSmrgthe generated IR for, say ``factorial``:
25836ac495dSmrg
25936ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
26036ac495dSmrg    :start-after: PUSH_RVALUE (gcc_jit_lvalue_as_rvalue (state.y))
26136ac495dSmrg    :end-before: /* Handle the individual opcodes.  */
26236ac495dSmrg    :language: c
26336ac495dSmrg
26436ac495dSmrgWe can now write the big ``switch`` statement that implements the
26536ac495dSmrgindividual opcodes, populating the relevant block with statements:
26636ac495dSmrg
26736ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
26836ac495dSmrg    :start-after: /* Handle the individual opcodes.  */
26936ac495dSmrg    :end-before: /* Go to the next block.  */
27036ac495dSmrg    :language: c
27136ac495dSmrg
27236ac495dSmrgEvery block must be terminated, via a call to one of the
27336ac495dSmrg``gcc_jit_block_end_with_`` entrypoints.  This has been done for two
27436ac495dSmrgof the opcodes, but we need to do it for the other ones, by jumping
27536ac495dSmrgto the next block.
27636ac495dSmrg
27736ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
27836ac495dSmrg    :start-after: /* Go to the next block.  */
27936ac495dSmrg    :end-before: /* end of loop on PC locations.  */
28036ac495dSmrg    :language: c
28136ac495dSmrg
28236ac495dSmrgThis is analogous to simply incrementing the program counter.
28336ac495dSmrg
28436ac495dSmrgVerifying the control flow graph
28536ac495dSmrg********************************
28636ac495dSmrgHaving finished looping over the blocks, the context is complete.
28736ac495dSmrg
28836ac495dSmrgAs before, we can verify that the control flow and statements are sane by
28936ac495dSmrgusing :c:func:`gcc_jit_function_dump_to_dot`:
29036ac495dSmrg
29136ac495dSmrg.. code-block:: c
29236ac495dSmrg
29336ac495dSmrg  gcc_jit_function_dump_to_dot (state.fn, "/tmp/factorial.dot");
29436ac495dSmrg
29536ac495dSmrgand viewing the result.  Note how the label names, comments, and
29636ac495dSmrgvariable names show up in the dump, to make it easier to spot
29736ac495dSmrgerrors in our compiler.
29836ac495dSmrg
29936ac495dSmrg  .. figure:: factorial.png
30036ac495dSmrg    :alt: image of a control flow graph
30136ac495dSmrg
30236ac495dSmrgCompiling the context
30336ac495dSmrg*********************
30436ac495dSmrgHaving finished looping over the blocks and populating them with
30536ac495dSmrgstatements, the context is complete.
30636ac495dSmrg
30736ac495dSmrgWe can now compile it, and extract machine code from the result:
30836ac495dSmrg
30936ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
31036ac495dSmrg    :start-after: /* We've now finished populating the context.  Compile it.  */
31136ac495dSmrg    :end-before: /* (this leaks "result" and "funcname") */
31236ac495dSmrg    :language: c
31336ac495dSmrg
31436ac495dSmrgWe can now run the result:
31536ac495dSmrg
31636ac495dSmrg   .. literalinclude:: ../examples/tut04-toyvm/toyvm.c
31736ac495dSmrg    :start-after: /* JIT-compilation.  */
31836ac495dSmrg    :end-before: return 0;
31936ac495dSmrg    :language: c
32036ac495dSmrg
32136ac495dSmrgSingle-stepping through the generated code
32236ac495dSmrg******************************************
32336ac495dSmrg
32436ac495dSmrgIt's possible to debug the generated code.  To do this we need to both:
32536ac495dSmrg
32636ac495dSmrg  * Set up source code locations for our statements, so that we can
32736ac495dSmrg    meaningfully step through the code.  We did this above by
32836ac495dSmrg    calling :c:func:`gcc_jit_context_new_location` and using the
32936ac495dSmrg    results.
33036ac495dSmrg
33136ac495dSmrg  * Enable the generation of debugging information, by setting
33236ac495dSmrg    :c:macro:`GCC_JIT_BOOL_OPTION_DEBUGINFO` on the
33336ac495dSmrg    :c:type:`gcc_jit_context` via
33436ac495dSmrg    :c:func:`gcc_jit_context_set_bool_option`:
33536ac495dSmrg
33636ac495dSmrg    .. code-block:: c
33736ac495dSmrg
33836ac495dSmrg     gcc_jit_context_set_bool_option (
33936ac495dSmrg       ctxt,
34036ac495dSmrg       GCC_JIT_BOOL_OPTION_DEBUGINFO,
34136ac495dSmrg       1);
34236ac495dSmrg
34336ac495dSmrgHaving done this, we can put a breakpoint on the generated function:
34436ac495dSmrg
34536ac495dSmrg.. code-block:: console
34636ac495dSmrg
34736ac495dSmrg  $ gdb --args ./toyvm factorial.toy 10
34836ac495dSmrg  (gdb) break factorial
34936ac495dSmrg  Function "factorial" not defined.
35036ac495dSmrg  Make breakpoint pending on future shared library load? (y or [n]) y
35136ac495dSmrg  Breakpoint 1 (factorial) pending.
35236ac495dSmrg  (gdb) run
35336ac495dSmrg  Breakpoint 1, factorial (arg=10) at factorial.toy:14
35436ac495dSmrg  14	DUP
35536ac495dSmrg
35636ac495dSmrgWe've set up location information, which references ``factorial.toy``.
35736ac495dSmrgThis allows us to use e.g. ``list`` to see where we are in the script:
35836ac495dSmrg
35936ac495dSmrg.. code-block:: console
36036ac495dSmrg
36136ac495dSmrg  (gdb) list
36236ac495dSmrg  9
36336ac495dSmrg  10    # Initial state:
36436ac495dSmrg  11    # stack: [arg]
36536ac495dSmrg  12
36636ac495dSmrg  13    # 0:
36736ac495dSmrg  14    DUP
36836ac495dSmrg  15    # stack: [arg, arg]
36936ac495dSmrg  16
37036ac495dSmrg  17    # 1:
37136ac495dSmrg  18    PUSH_CONST 2
37236ac495dSmrg
37336ac495dSmrgand to step through the function, examining the data:
37436ac495dSmrg
37536ac495dSmrg.. code-block:: console
37636ac495dSmrg
37736ac495dSmrg  (gdb) n
37836ac495dSmrg  18    PUSH_CONST 2
37936ac495dSmrg  (gdb) n
38036ac495dSmrg  22    BINARY_COMPARE_LT
38136ac495dSmrg  (gdb) print stack
38236ac495dSmrg  $5 = {10, 10, 2, 0, -7152, 32767, 0, 0}
38336ac495dSmrg  (gdb) print stack_depth
38436ac495dSmrg  $6 = 3
38536ac495dSmrg
38636ac495dSmrgYou'll see that the parts of the ``stack`` array that haven't been
38736ac495dSmrgtouched yet are uninitialized.
38836ac495dSmrg
38936ac495dSmrg.. note::
39036ac495dSmrg
39136ac495dSmrg   Turning on optimizations may lead to unpredictable results when
39236ac495dSmrg   stepping through the generated code: the execution may appear to
39336ac495dSmrg   "jump around" the source code.  This is analogous to turning up the
39436ac495dSmrg   optimization level in a regular compiler.
39536ac495dSmrg
39636ac495dSmrgExamining the generated code
39736ac495dSmrg****************************
39836ac495dSmrg
39936ac495dSmrgHow good is the optimized code?
40036ac495dSmrg
40136ac495dSmrgWe can turn up optimizations, by calling
40236ac495dSmrg:c:func:`gcc_jit_context_set_int_option` with
40336ac495dSmrg:c:macro:`GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL`:
40436ac495dSmrg
40536ac495dSmrg.. code-block:: c
40636ac495dSmrg
40736ac495dSmrg  gcc_jit_context_set_int_option (
40836ac495dSmrg    ctxt,
40936ac495dSmrg    GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL,
41036ac495dSmrg    3);
41136ac495dSmrg
41236ac495dSmrgOne of GCC's internal representations is called "gimple".  A dump of the
41336ac495dSmrginitial gimple representation of the code can be seen by setting:
41436ac495dSmrg
41536ac495dSmrg.. code-block:: c
41636ac495dSmrg
41736ac495dSmrg  gcc_jit_context_set_bool_option (ctxt,
41836ac495dSmrg                                   GCC_JIT_BOOL_OPTION_DUMP_INITIAL_GIMPLE,
41936ac495dSmrg                                   1);
42036ac495dSmrg
42136ac495dSmrgWith optimization on and source locations displayed, this gives:
42236ac495dSmrg
42336ac495dSmrg.. We'll use "c" for gimple dumps
42436ac495dSmrg
42536ac495dSmrg.. code-block:: c
42636ac495dSmrg
42736ac495dSmrg  factorial (signed int arg)
42836ac495dSmrg  {
42936ac495dSmrg    <unnamed type> D.80;
43036ac495dSmrg    signed int D.81;
43136ac495dSmrg    signed int D.82;
43236ac495dSmrg    signed int D.83;
43336ac495dSmrg    signed int D.84;
43436ac495dSmrg    signed int D.85;
43536ac495dSmrg    signed int y;
43636ac495dSmrg    signed int x;
43736ac495dSmrg    signed int stack_depth;
43836ac495dSmrg    signed int stack[8];
43936ac495dSmrg
44036ac495dSmrg    try
44136ac495dSmrg      {
44236ac495dSmrg        initial:
44336ac495dSmrg        stack_depth = 0;
44436ac495dSmrg        stack[stack_depth] = arg;
44536ac495dSmrg        stack_depth = stack_depth + 1;
44636ac495dSmrg        goto instr0;
44736ac495dSmrg        instr0:
44836ac495dSmrg        /* DUP */:
44936ac495dSmrg        stack_depth = stack_depth + -1;
45036ac495dSmrg        x = stack[stack_depth];
45136ac495dSmrg        stack[stack_depth] = x;
45236ac495dSmrg        stack_depth = stack_depth + 1;
45336ac495dSmrg        stack[stack_depth] = x;
45436ac495dSmrg        stack_depth = stack_depth + 1;
45536ac495dSmrg        goto instr1;
45636ac495dSmrg        instr1:
45736ac495dSmrg        /* PUSH_CONST */:
45836ac495dSmrg        stack[stack_depth] = 2;
45936ac495dSmrg        stack_depth = stack_depth + 1;
46036ac495dSmrg        goto instr2;
46136ac495dSmrg
46236ac495dSmrg        /* etc */
46336ac495dSmrg
46436ac495dSmrgYou can see the generated machine code in assembly form via:
46536ac495dSmrg
46636ac495dSmrg.. code-block:: c
46736ac495dSmrg
46836ac495dSmrg  gcc_jit_context_set_bool_option (
46936ac495dSmrg    ctxt,
47036ac495dSmrg    GCC_JIT_BOOL_OPTION_DUMP_GENERATED_CODE,
47136ac495dSmrg    1);
47236ac495dSmrg  result = gcc_jit_context_compile (ctxt);
47336ac495dSmrg
47436ac495dSmrgwhich shows that (on this x86_64 box) the compiler has unrolled the loop
47536ac495dSmrgand is using MMX instructions to perform several multiplications
47636ac495dSmrgsimultaneously:
47736ac495dSmrg
47836ac495dSmrg.. code-block:: gas
47936ac495dSmrg
48036ac495dSmrg          .file   "fake.c"
48136ac495dSmrg          .text
48236ac495dSmrg  .Ltext0:
48336ac495dSmrg          .p2align 4,,15
48436ac495dSmrg          .globl  factorial
48536ac495dSmrg          .type   factorial, @function
48636ac495dSmrg  factorial:
48736ac495dSmrg  .LFB0:
48836ac495dSmrg          .file 1 "factorial.toy"
48936ac495dSmrg          .loc 1 14 0
49036ac495dSmrg          .cfi_startproc
49136ac495dSmrg  .LVL0:
49236ac495dSmrg  .L2:
49336ac495dSmrg          .loc 1 26 0
49436ac495dSmrg          cmpl    $1, %edi
49536ac495dSmrg          jle     .L13
49636ac495dSmrg          leal    -1(%rdi), %edx
49736ac495dSmrg          movl    %edx, %ecx
49836ac495dSmrg          shrl    $2, %ecx
49936ac495dSmrg          leal    0(,%rcx,4), %esi
50036ac495dSmrg          testl   %esi, %esi
50136ac495dSmrg          je      .L14
50236ac495dSmrg          cmpl    $9, %edx
50336ac495dSmrg          jbe     .L14
50436ac495dSmrg          leal    -2(%rdi), %eax
50536ac495dSmrg          movl    %eax, -16(%rsp)
50636ac495dSmrg          leal    -3(%rdi), %eax
50736ac495dSmrg          movd    -16(%rsp), %xmm0
50836ac495dSmrg          movl    %edi, -16(%rsp)
50936ac495dSmrg          movl    %eax, -12(%rsp)
51036ac495dSmrg          movd    -16(%rsp), %xmm1
51136ac495dSmrg          xorl    %eax, %eax
51236ac495dSmrg          movl    %edx, -16(%rsp)
51336ac495dSmrg          movd    -12(%rsp), %xmm4
51436ac495dSmrg          movd    -16(%rsp), %xmm6
51536ac495dSmrg          punpckldq       %xmm4, %xmm0
51636ac495dSmrg          movdqa  .LC1(%rip), %xmm4
51736ac495dSmrg          punpckldq       %xmm6, %xmm1
51836ac495dSmrg          punpcklqdq      %xmm0, %xmm1
51936ac495dSmrg          movdqa  .LC0(%rip), %xmm0
52036ac495dSmrg          jmp     .L5
52136ac495dSmrg          # etc - edited for brevity
52236ac495dSmrg
52336ac495dSmrgThis is clearly overkill for a function that will likely overflow the
52436ac495dSmrg``int`` type before the vectorization is worthwhile - but then again, this
52536ac495dSmrgis a toy example.
52636ac495dSmrg
52736ac495dSmrgTurning down the optimization level to 2:
52836ac495dSmrg
52936ac495dSmrg.. code-block:: c
53036ac495dSmrg
53136ac495dSmrg  gcc_jit_context_set_int_option (
53236ac495dSmrg    ctxt,
53336ac495dSmrg    GCC_JIT_INT_OPTION_OPTIMIZATION_LEVEL,
53436ac495dSmrg    3);
53536ac495dSmrg
53636ac495dSmrgyields this code, which is simple enough to quote in its entirety:
53736ac495dSmrg
53836ac495dSmrg.. code-block:: gas
53936ac495dSmrg
54036ac495dSmrg          .file   "fake.c"
54136ac495dSmrg          .text
54236ac495dSmrg          .p2align 4,,15
54336ac495dSmrg          .globl  factorial
54436ac495dSmrg          .type   factorial, @function
54536ac495dSmrg  factorial:
54636ac495dSmrg  .LFB0:
54736ac495dSmrg          .cfi_startproc
54836ac495dSmrg  .L2:
54936ac495dSmrg          cmpl    $1, %edi
55036ac495dSmrg          jle     .L8
55136ac495dSmrg          movl    $1, %edx
55236ac495dSmrg          jmp     .L4
55336ac495dSmrg          .p2align 4,,10
55436ac495dSmrg          .p2align 3
55536ac495dSmrg  .L6:
55636ac495dSmrg          movl    %eax, %edi
55736ac495dSmrg  .L4:
55836ac495dSmrg  .L5:
55936ac495dSmrg          leal    -1(%rdi), %eax
56036ac495dSmrg          imull   %edi, %edx
56136ac495dSmrg          cmpl    $1, %eax
56236ac495dSmrg          jne     .L6
56336ac495dSmrg  .L3:
56436ac495dSmrg  .L7:
56536ac495dSmrg          imull   %edx, %eax
56636ac495dSmrg          ret
56736ac495dSmrg  .L8:
56836ac495dSmrg          movl    %edi, %eax
56936ac495dSmrg          movl    $1, %edx
57036ac495dSmrg          jmp     .L7
57136ac495dSmrg          .cfi_endproc
57236ac495dSmrg  .LFE0:
57336ac495dSmrg          .size   factorial, .-factorial
57436ac495dSmrg          .ident  "GCC: (GNU) 4.9.0 20131023 (Red Hat 0.2-%{gcc_release})"
57536ac495dSmrg          .section        .note.GNU-stack,"",@progbits
57636ac495dSmrg
57736ac495dSmrgNote that the stack pushing and popping have been eliminated, as has the
57836ac495dSmrgrecursive call (in favor of an iteration).
57936ac495dSmrg
58036ac495dSmrgPutting it all together
58136ac495dSmrg***********************
58236ac495dSmrg
58336ac495dSmrgThe complete example can be seen in the source tree at
58436ac495dSmrg``gcc/jit/docs/examples/tut04-toyvm/toyvm.c``
58536ac495dSmrg
58636ac495dSmrgalong with a Makefile and a couple of sample .toy scripts:
58736ac495dSmrg
58836ac495dSmrg.. code-block:: console
58936ac495dSmrg
59036ac495dSmrg  $ ls -al
59136ac495dSmrg  drwxrwxr-x. 2 david david   4096 Sep 19 17:46 .
59236ac495dSmrg  drwxrwxr-x. 3 david david   4096 Sep 19 15:26 ..
59336ac495dSmrg  -rw-rw-r--. 1 david david    615 Sep 19 12:43 factorial.toy
59436ac495dSmrg  -rw-rw-r--. 1 david david    834 Sep 19 13:08 fibonacci.toy
59536ac495dSmrg  -rw-rw-r--. 1 david david    238 Sep 19 14:22 Makefile
59636ac495dSmrg  -rw-rw-r--. 1 david david  16457 Sep 19 17:07 toyvm.c
59736ac495dSmrg
59836ac495dSmrg  $ make toyvm
59936ac495dSmrg  g++ -Wall -g -o toyvm toyvm.c -lgccjit
60036ac495dSmrg
60136ac495dSmrg  $ ./toyvm factorial.toy 10
60236ac495dSmrg  interpreter result: 3628800
60336ac495dSmrg  compiler result: 3628800
60436ac495dSmrg
60536ac495dSmrg  $ ./toyvm fibonacci.toy 10
60636ac495dSmrg  interpreter result: 55
60736ac495dSmrg  compiler result: 55
60836ac495dSmrg
60936ac495dSmrgBehind the curtain: How does our code get optimized?
61036ac495dSmrg****************************************************
61136ac495dSmrg
61236ac495dSmrgOur example is done, but you may be wondering about exactly how the
61336ac495dSmrgcompiler turned what we gave it into the machine code seen above.
61436ac495dSmrg
61536ac495dSmrgWe can examine what the compiler is doing in detail by setting:
61636ac495dSmrg
61736ac495dSmrg.. code-block:: c
61836ac495dSmrg
61936ac495dSmrg  gcc_jit_context_set_bool_option (state.ctxt,
62036ac495dSmrg                                   GCC_JIT_BOOL_OPTION_DUMP_EVERYTHING,
62136ac495dSmrg                                   1);
62236ac495dSmrg  gcc_jit_context_set_bool_option (state.ctxt,
62336ac495dSmrg                                   GCC_JIT_BOOL_OPTION_KEEP_INTERMEDIATES,
62436ac495dSmrg                                   1);
62536ac495dSmrg
62636ac495dSmrgThis will dump detailed information about the compiler's state to a
62736ac495dSmrgdirectory under ``/tmp``, and keep it from being cleaned up.
62836ac495dSmrg
62936ac495dSmrgThe precise names and their formats of these files is subject to change.
63036ac495dSmrgHigher optimization levels lead to more files.
63136ac495dSmrgHere's what I saw (edited for brevity; there were almost 200 files):
63236ac495dSmrg
63336ac495dSmrg.. code-block:: console
63436ac495dSmrg
63536ac495dSmrg  intermediate files written to /tmp/libgccjit-KPQbGw
63636ac495dSmrg  $ ls /tmp/libgccjit-KPQbGw/
63736ac495dSmrg  fake.c.000i.cgraph
63836ac495dSmrg  fake.c.000i.type-inheritance
63936ac495dSmrg  fake.c.004t.gimple
64036ac495dSmrg  fake.c.007t.omplower
64136ac495dSmrg  fake.c.008t.lower
64236ac495dSmrg  fake.c.011t.eh
64336ac495dSmrg  fake.c.012t.cfg
64436ac495dSmrg  fake.c.014i.visibility
64536ac495dSmrg  fake.c.015i.early_local_cleanups
64636ac495dSmrg  fake.c.016t.ssa
64736ac495dSmrg  # etc
64836ac495dSmrg
64936ac495dSmrgThe gimple code is converted into Static Single Assignment form,
65036ac495dSmrgwith annotations for use when generating the debuginfo:
65136ac495dSmrg
65236ac495dSmrg.. code-block:: console
65336ac495dSmrg
65436ac495dSmrg  $ less /tmp/libgccjit-KPQbGw/fake.c.016t.ssa
65536ac495dSmrg
65636ac495dSmrg.. code-block:: c
65736ac495dSmrg
65836ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
65936ac495dSmrg
66036ac495dSmrg  factorial (signed int arg)
66136ac495dSmrg  {
66236ac495dSmrg    signed int stack[8];
66336ac495dSmrg    signed int stack_depth;
66436ac495dSmrg    signed int x;
66536ac495dSmrg    signed int y;
66636ac495dSmrg    <unnamed type> _20;
66736ac495dSmrg    signed int _21;
66836ac495dSmrg    signed int _38;
66936ac495dSmrg    signed int _44;
67036ac495dSmrg    signed int _51;
67136ac495dSmrg    signed int _56;
67236ac495dSmrg
67336ac495dSmrg  initial:
67436ac495dSmrg    stack_depth_3 = 0;
67536ac495dSmrg    # DEBUG stack_depth => stack_depth_3
67636ac495dSmrg    stack[stack_depth_3] = arg_5(D);
67736ac495dSmrg    stack_depth_7 = stack_depth_3 + 1;
67836ac495dSmrg    # DEBUG stack_depth => stack_depth_7
67936ac495dSmrg    # DEBUG instr0 => NULL
68036ac495dSmrg    # DEBUG /* DUP */ => NULL
68136ac495dSmrg    stack_depth_8 = stack_depth_7 + -1;
68236ac495dSmrg    # DEBUG stack_depth => stack_depth_8
68336ac495dSmrg    x_9 = stack[stack_depth_8];
68436ac495dSmrg    # DEBUG x => x_9
68536ac495dSmrg    stack[stack_depth_8] = x_9;
68636ac495dSmrg    stack_depth_11 = stack_depth_8 + 1;
68736ac495dSmrg    # DEBUG stack_depth => stack_depth_11
68836ac495dSmrg    stack[stack_depth_11] = x_9;
68936ac495dSmrg    stack_depth_13 = stack_depth_11 + 1;
69036ac495dSmrg    # DEBUG stack_depth => stack_depth_13
69136ac495dSmrg    # DEBUG instr1 => NULL
69236ac495dSmrg    # DEBUG /* PUSH_CONST */ => NULL
69336ac495dSmrg    stack[stack_depth_13] = 2;
69436ac495dSmrg
69536ac495dSmrg    /* etc; edited for brevity */
69636ac495dSmrg
69736ac495dSmrgWe can perhaps better see the code by turning off
69836ac495dSmrg:c:macro:`GCC_JIT_BOOL_OPTION_DEBUGINFO` to suppress all those ``DEBUG``
69936ac495dSmrgstatements, giving:
70036ac495dSmrg
70136ac495dSmrg.. code-block:: console
70236ac495dSmrg
70336ac495dSmrg  $ less /tmp/libgccjit-1Hywc0/fake.c.016t.ssa
70436ac495dSmrg
70536ac495dSmrg.. code-block:: c
70636ac495dSmrg
70736ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
70836ac495dSmrg
70936ac495dSmrg  factorial (signed int arg)
71036ac495dSmrg  {
71136ac495dSmrg    signed int stack[8];
71236ac495dSmrg    signed int stack_depth;
71336ac495dSmrg    signed int x;
71436ac495dSmrg    signed int y;
71536ac495dSmrg    <unnamed type> _20;
71636ac495dSmrg    signed int _21;
71736ac495dSmrg    signed int _38;
71836ac495dSmrg    signed int _44;
71936ac495dSmrg    signed int _51;
72036ac495dSmrg    signed int _56;
72136ac495dSmrg
72236ac495dSmrg  initial:
72336ac495dSmrg    stack_depth_3 = 0;
72436ac495dSmrg    stack[stack_depth_3] = arg_5(D);
72536ac495dSmrg    stack_depth_7 = stack_depth_3 + 1;
72636ac495dSmrg    stack_depth_8 = stack_depth_7 + -1;
72736ac495dSmrg    x_9 = stack[stack_depth_8];
72836ac495dSmrg    stack[stack_depth_8] = x_9;
72936ac495dSmrg    stack_depth_11 = stack_depth_8 + 1;
73036ac495dSmrg    stack[stack_depth_11] = x_9;
73136ac495dSmrg    stack_depth_13 = stack_depth_11 + 1;
73236ac495dSmrg    stack[stack_depth_13] = 2;
73336ac495dSmrg    stack_depth_15 = stack_depth_13 + 1;
73436ac495dSmrg    stack_depth_16 = stack_depth_15 + -1;
73536ac495dSmrg    y_17 = stack[stack_depth_16];
73636ac495dSmrg    stack_depth_18 = stack_depth_16 + -1;
73736ac495dSmrg    x_19 = stack[stack_depth_18];
73836ac495dSmrg    _20 = x_19 < y_17;
73936ac495dSmrg    _21 = (signed int) _20;
74036ac495dSmrg    stack[stack_depth_18] = _21;
74136ac495dSmrg    stack_depth_23 = stack_depth_18 + 1;
74236ac495dSmrg    stack_depth_24 = stack_depth_23 + -1;
74336ac495dSmrg    x_25 = stack[stack_depth_24];
74436ac495dSmrg    if (x_25 != 0)
74536ac495dSmrg      goto <bb 4> (instr9);
74636ac495dSmrg    else
74736ac495dSmrg      goto <bb 3> (instr4);
74836ac495dSmrg
74936ac495dSmrg  instr4:
75036ac495dSmrg  /* DUP */:
75136ac495dSmrg    stack_depth_26 = stack_depth_24 + -1;
75236ac495dSmrg    x_27 = stack[stack_depth_26];
75336ac495dSmrg    stack[stack_depth_26] = x_27;
75436ac495dSmrg    stack_depth_29 = stack_depth_26 + 1;
75536ac495dSmrg    stack[stack_depth_29] = x_27;
75636ac495dSmrg    stack_depth_31 = stack_depth_29 + 1;
75736ac495dSmrg    stack[stack_depth_31] = 1;
75836ac495dSmrg    stack_depth_33 = stack_depth_31 + 1;
75936ac495dSmrg    stack_depth_34 = stack_depth_33 + -1;
76036ac495dSmrg    y_35 = stack[stack_depth_34];
76136ac495dSmrg    stack_depth_36 = stack_depth_34 + -1;
76236ac495dSmrg    x_37 = stack[stack_depth_36];
76336ac495dSmrg    _38 = x_37 - y_35;
76436ac495dSmrg    stack[stack_depth_36] = _38;
76536ac495dSmrg    stack_depth_40 = stack_depth_36 + 1;
76636ac495dSmrg    stack_depth_41 = stack_depth_40 + -1;
76736ac495dSmrg    x_42 = stack[stack_depth_41];
76836ac495dSmrg    _44 = factorial (x_42);
76936ac495dSmrg    stack[stack_depth_41] = _44;
77036ac495dSmrg    stack_depth_46 = stack_depth_41 + 1;
77136ac495dSmrg    stack_depth_47 = stack_depth_46 + -1;
77236ac495dSmrg    y_48 = stack[stack_depth_47];
77336ac495dSmrg    stack_depth_49 = stack_depth_47 + -1;
77436ac495dSmrg    x_50 = stack[stack_depth_49];
77536ac495dSmrg    _51 = x_50 * y_48;
77636ac495dSmrg    stack[stack_depth_49] = _51;
77736ac495dSmrg    stack_depth_53 = stack_depth_49 + 1;
77836ac495dSmrg
77936ac495dSmrg    # stack_depth_1 = PHI <stack_depth_24(2), stack_depth_53(3)>
78036ac495dSmrg  instr9:
78136ac495dSmrg  /* RETURN */:
78236ac495dSmrg    stack_depth_54 = stack_depth_1 + -1;
78336ac495dSmrg    x_55 = stack[stack_depth_54];
78436ac495dSmrg    _56 = x_55;
78536ac495dSmrg    stack ={v} {CLOBBER};
78636ac495dSmrg    return _56;
78736ac495dSmrg
78836ac495dSmrg  }
78936ac495dSmrg
79036ac495dSmrgNote in the above how all the :c:type:`gcc_jit_block` instances we
79136ac495dSmrgcreated have been consolidated into just 3 blocks in GCC's internal
79236ac495dSmrgrepresentation: ``initial``, ``instr4`` and ``instr9``.
79336ac495dSmrg
79436ac495dSmrgOptimizing away stack manipulation
79536ac495dSmrg^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
79636ac495dSmrgRecall our simple implementation of stack operations.  Let's examine
79736ac495dSmrghow the stack operations are optimized away.
79836ac495dSmrg
79936ac495dSmrgAfter a pass of constant-propagation, the depth of the stack at each
80036ac495dSmrgopcode can be determined at compile-time:
80136ac495dSmrg
80236ac495dSmrg.. code-block:: console
80336ac495dSmrg
80436ac495dSmrg  $ less /tmp/libgccjit-1Hywc0/fake.c.021t.ccp1
80536ac495dSmrg
80636ac495dSmrg.. code-block:: c
80736ac495dSmrg
80836ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
80936ac495dSmrg
81036ac495dSmrg  factorial (signed int arg)
81136ac495dSmrg  {
81236ac495dSmrg    signed int stack[8];
81336ac495dSmrg    signed int stack_depth;
81436ac495dSmrg    signed int x;
81536ac495dSmrg    signed int y;
81636ac495dSmrg    <unnamed type> _20;
81736ac495dSmrg    signed int _21;
81836ac495dSmrg    signed int _38;
81936ac495dSmrg    signed int _44;
82036ac495dSmrg    signed int _51;
82136ac495dSmrg
82236ac495dSmrg  initial:
82336ac495dSmrg    stack[0] = arg_5(D);
82436ac495dSmrg    x_9 = stack[0];
82536ac495dSmrg    stack[0] = x_9;
82636ac495dSmrg    stack[1] = x_9;
82736ac495dSmrg    stack[2] = 2;
82836ac495dSmrg    y_17 = stack[2];
82936ac495dSmrg    x_19 = stack[1];
83036ac495dSmrg    _20 = x_19 < y_17;
83136ac495dSmrg    _21 = (signed int) _20;
83236ac495dSmrg    stack[1] = _21;
83336ac495dSmrg    x_25 = stack[1];
83436ac495dSmrg    if (x_25 != 0)
83536ac495dSmrg      goto <bb 4> (instr9);
83636ac495dSmrg    else
83736ac495dSmrg      goto <bb 3> (instr4);
83836ac495dSmrg
83936ac495dSmrg  instr4:
84036ac495dSmrg  /* DUP */:
84136ac495dSmrg    x_27 = stack[0];
84236ac495dSmrg    stack[0] = x_27;
84336ac495dSmrg    stack[1] = x_27;
84436ac495dSmrg    stack[2] = 1;
84536ac495dSmrg    y_35 = stack[2];
84636ac495dSmrg    x_37 = stack[1];
84736ac495dSmrg    _38 = x_37 - y_35;
84836ac495dSmrg    stack[1] = _38;
84936ac495dSmrg    x_42 = stack[1];
85036ac495dSmrg    _44 = factorial (x_42);
85136ac495dSmrg    stack[1] = _44;
85236ac495dSmrg    y_48 = stack[1];
85336ac495dSmrg    x_50 = stack[0];
85436ac495dSmrg    _51 = x_50 * y_48;
85536ac495dSmrg    stack[0] = _51;
85636ac495dSmrg
85736ac495dSmrg  instr9:
85836ac495dSmrg  /* RETURN */:
85936ac495dSmrg    x_55 = stack[0];
86036ac495dSmrg    x_56 = x_55;
86136ac495dSmrg    stack ={v} {CLOBBER};
86236ac495dSmrg    return x_56;
86336ac495dSmrg
86436ac495dSmrg  }
86536ac495dSmrg
86636ac495dSmrgNote how, in the above, all those ``stack_depth`` values are now just
86736ac495dSmrgconstants: we're accessing specific stack locations at each opcode.
86836ac495dSmrg
86936ac495dSmrgThe "esra" pass ("Early Scalar Replacement of Aggregates") breaks
87036ac495dSmrgout our "stack" array into individual elements:
87136ac495dSmrg
87236ac495dSmrg.. code-block:: console
87336ac495dSmrg
87436ac495dSmrg  $ less /tmp/libgccjit-1Hywc0/fake.c.024t.esra
87536ac495dSmrg
87636ac495dSmrg.. code-block:: c
87736ac495dSmrg
87836ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
87936ac495dSmrg
88036ac495dSmrg  Created a replacement for stack offset: 0, size: 32: stack$0
88136ac495dSmrg  Created a replacement for stack offset: 32, size: 32: stack$1
88236ac495dSmrg  Created a replacement for stack offset: 64, size: 32: stack$2
88336ac495dSmrg
88436ac495dSmrg  Symbols to be put in SSA form
88536ac495dSmrg  { D.89 D.90 D.91 }
88636ac495dSmrg  Incremental SSA update started at block: 0
88736ac495dSmrg  Number of blocks in CFG: 5
88836ac495dSmrg  Number of blocks to update: 4 ( 80%)
88936ac495dSmrg
89036ac495dSmrg
89136ac495dSmrg  factorial (signed int arg)
89236ac495dSmrg  {
89336ac495dSmrg    signed int stack$2;
89436ac495dSmrg    signed int stack$1;
89536ac495dSmrg    signed int stack$0;
89636ac495dSmrg    signed int stack[8];
89736ac495dSmrg    signed int stack_depth;
89836ac495dSmrg    signed int x;
89936ac495dSmrg    signed int y;
90036ac495dSmrg    <unnamed type> _20;
90136ac495dSmrg    signed int _21;
90236ac495dSmrg    signed int _38;
90336ac495dSmrg    signed int _44;
90436ac495dSmrg    signed int _51;
90536ac495dSmrg
90636ac495dSmrg  initial:
90736ac495dSmrg    stack$0_45 = arg_5(D);
90836ac495dSmrg    x_9 = stack$0_45;
90936ac495dSmrg    stack$0_39 = x_9;
91036ac495dSmrg    stack$1_32 = x_9;
91136ac495dSmrg    stack$2_30 = 2;
91236ac495dSmrg    y_17 = stack$2_30;
91336ac495dSmrg    x_19 = stack$1_32;
91436ac495dSmrg    _20 = x_19 < y_17;
91536ac495dSmrg    _21 = (signed int) _20;
91636ac495dSmrg    stack$1_28 = _21;
91736ac495dSmrg    x_25 = stack$1_28;
91836ac495dSmrg    if (x_25 != 0)
91936ac495dSmrg      goto <bb 4> (instr9);
92036ac495dSmrg    else
92136ac495dSmrg      goto <bb 3> (instr4);
92236ac495dSmrg
92336ac495dSmrg  instr4:
92436ac495dSmrg  /* DUP */:
92536ac495dSmrg    x_27 = stack$0_39;
92636ac495dSmrg    stack$0_22 = x_27;
92736ac495dSmrg    stack$1_14 = x_27;
92836ac495dSmrg    stack$2_12 = 1;
92936ac495dSmrg    y_35 = stack$2_12;
93036ac495dSmrg    x_37 = stack$1_14;
93136ac495dSmrg    _38 = x_37 - y_35;
93236ac495dSmrg    stack$1_10 = _38;
93336ac495dSmrg    x_42 = stack$1_10;
93436ac495dSmrg    _44 = factorial (x_42);
93536ac495dSmrg    stack$1_6 = _44;
93636ac495dSmrg    y_48 = stack$1_6;
93736ac495dSmrg    x_50 = stack$0_22;
93836ac495dSmrg    _51 = x_50 * y_48;
93936ac495dSmrg    stack$0_1 = _51;
94036ac495dSmrg
94136ac495dSmrg    # stack$0_52 = PHI <stack$0_39(2), stack$0_1(3)>
94236ac495dSmrg  instr9:
94336ac495dSmrg  /* RETURN */:
94436ac495dSmrg    x_55 = stack$0_52;
94536ac495dSmrg    x_56 = x_55;
94636ac495dSmrg    stack ={v} {CLOBBER};
94736ac495dSmrg    return x_56;
94836ac495dSmrg
94936ac495dSmrg  }
95036ac495dSmrg
95136ac495dSmrgHence at this point, all those pushes and pops of the stack are now
95236ac495dSmrgsimply assignments to specific temporary variables.
95336ac495dSmrg
95436ac495dSmrgAfter some copy propagation, the stack manipulation has been completely
95536ac495dSmrgoptimized away:
95636ac495dSmrg
95736ac495dSmrg.. code-block:: console
95836ac495dSmrg
95936ac495dSmrg  $ less /tmp/libgccjit-1Hywc0/fake.c.026t.copyprop1
96036ac495dSmrg
96136ac495dSmrg.. code-block:: c
96236ac495dSmrg
96336ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
96436ac495dSmrg
96536ac495dSmrg  factorial (signed int arg)
96636ac495dSmrg  {
96736ac495dSmrg    signed int stack$2;
96836ac495dSmrg    signed int stack$1;
96936ac495dSmrg    signed int stack$0;
97036ac495dSmrg    signed int stack[8];
97136ac495dSmrg    signed int stack_depth;
97236ac495dSmrg    signed int x;
97336ac495dSmrg    signed int y;
97436ac495dSmrg    <unnamed type> _20;
97536ac495dSmrg    signed int _21;
97636ac495dSmrg    signed int _38;
97736ac495dSmrg    signed int _44;
97836ac495dSmrg    signed int _51;
97936ac495dSmrg
98036ac495dSmrg  initial:
98136ac495dSmrg    stack$0_39 = arg_5(D);
98236ac495dSmrg    _20 = arg_5(D) <= 1;
98336ac495dSmrg    _21 = (signed int) _20;
98436ac495dSmrg    if (_21 != 0)
98536ac495dSmrg      goto <bb 4> (instr9);
98636ac495dSmrg    else
98736ac495dSmrg      goto <bb 3> (instr4);
98836ac495dSmrg
98936ac495dSmrg  instr4:
99036ac495dSmrg  /* DUP */:
99136ac495dSmrg    _38 = arg_5(D) + -1;
99236ac495dSmrg    _44 = factorial (_38);
99336ac495dSmrg    _51 = arg_5(D) * _44;
99436ac495dSmrg    stack$0_1 = _51;
99536ac495dSmrg
99636ac495dSmrg    # stack$0_52 = PHI <arg_5(D)(2), _51(3)>
99736ac495dSmrg  instr9:
99836ac495dSmrg  /* RETURN */:
99936ac495dSmrg    stack ={v} {CLOBBER};
100036ac495dSmrg    return stack$0_52;
100136ac495dSmrg
100236ac495dSmrg  }
100336ac495dSmrg
100436ac495dSmrgLater on, another pass finally eliminated ``stack_depth`` local and the
100536ac495dSmrgunused parts of the `stack`` array altogether:
100636ac495dSmrg
100736ac495dSmrg.. code-block:: console
100836ac495dSmrg
100936ac495dSmrg  $ less /tmp/libgccjit-1Hywc0/fake.c.036t.release_ssa
101036ac495dSmrg
101136ac495dSmrg.. code-block:: c
101236ac495dSmrg
101336ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
101436ac495dSmrg
101536ac495dSmrg  Released 44 names, 314.29%, removed 44 holes
101636ac495dSmrg  factorial (signed int arg)
101736ac495dSmrg  {
101836ac495dSmrg    signed int stack$0;
101936ac495dSmrg    signed int mult_acc_1;
102036ac495dSmrg    <unnamed type> _5;
102136ac495dSmrg    signed int _6;
102236ac495dSmrg    signed int _7;
102336ac495dSmrg    signed int mul_tmp_10;
102436ac495dSmrg    signed int mult_acc_11;
102536ac495dSmrg    signed int mult_acc_13;
102636ac495dSmrg
102736ac495dSmrg    # arg_9 = PHI <arg_8(D)(0)>
102836ac495dSmrg    # mult_acc_13 = PHI <1(0)>
102936ac495dSmrg  initial:
103036ac495dSmrg
103136ac495dSmrg    <bb 5>:
103236ac495dSmrg    # arg_4 = PHI <arg_9(2), _7(3)>
103336ac495dSmrg    # mult_acc_1 = PHI <mult_acc_13(2), mult_acc_11(3)>
103436ac495dSmrg    _5 = arg_4 <= 1;
103536ac495dSmrg    _6 = (signed int) _5;
103636ac495dSmrg    if (_6 != 0)
103736ac495dSmrg      goto <bb 4> (instr9);
103836ac495dSmrg    else
103936ac495dSmrg      goto <bb 3> (instr4);
104036ac495dSmrg
104136ac495dSmrg  instr4:
104236ac495dSmrg  /* DUP */:
104336ac495dSmrg    _7 = arg_4 + -1;
104436ac495dSmrg    mult_acc_11 = mult_acc_1 * arg_4;
104536ac495dSmrg    goto <bb 5>;
104636ac495dSmrg
104736ac495dSmrg    # stack$0_12 = PHI <arg_4(5)>
104836ac495dSmrg  instr9:
104936ac495dSmrg  /* RETURN */:
105036ac495dSmrg    mul_tmp_10 = mult_acc_1 * stack$0_12;
105136ac495dSmrg    return mul_tmp_10;
105236ac495dSmrg
105336ac495dSmrg  }
105436ac495dSmrg
105536ac495dSmrg
105636ac495dSmrgElimination of tail recursion
105736ac495dSmrg^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
105836ac495dSmrgAnother significant optimization is the detection that the call to
105936ac495dSmrg``factorial`` is tail recursion, which can be eliminated in favor of
106036ac495dSmrgan iteration:
106136ac495dSmrg
106236ac495dSmrg.. code-block:: console
106336ac495dSmrg
106436ac495dSmrg  $ less /tmp/libgccjit-1Hywc0/fake.c.030t.tailr1
106536ac495dSmrg
106636ac495dSmrg.. code-block:: c
106736ac495dSmrg
106836ac495dSmrg  ;; Function factorial (factorial, funcdef_no=0, decl_uid=53, symbol_order=0)
106936ac495dSmrg
107036ac495dSmrg
107136ac495dSmrg  Symbols to be put in SSA form
107236ac495dSmrg  { D.88 }
107336ac495dSmrg  Incremental SSA update started at block: 0
107436ac495dSmrg  Number of blocks in CFG: 5
107536ac495dSmrg  Number of blocks to update: 4 ( 80%)
107636ac495dSmrg
107736ac495dSmrg
107836ac495dSmrg  factorial (signed int arg)
107936ac495dSmrg  {
108036ac495dSmrg    signed int stack$2;
108136ac495dSmrg    signed int stack$1;
108236ac495dSmrg    signed int stack$0;
108336ac495dSmrg    signed int stack[8];
108436ac495dSmrg    signed int stack_depth;
108536ac495dSmrg    signed int x;
108636ac495dSmrg    signed int y;
108736ac495dSmrg    signed int mult_acc_1;
108836ac495dSmrg    <unnamed type> _20;
108936ac495dSmrg    signed int _21;
109036ac495dSmrg    signed int _38;
109136ac495dSmrg    signed int mul_tmp_44;
109236ac495dSmrg    signed int mult_acc_51;
109336ac495dSmrg
109436ac495dSmrg    # arg_5 = PHI <arg_39(D)(0), _38(3)>
109536ac495dSmrg    # mult_acc_1 = PHI <1(0), mult_acc_51(3)>
109636ac495dSmrg  initial:
109736ac495dSmrg    _20 = arg_5 <= 1;
109836ac495dSmrg    _21 = (signed int) _20;
109936ac495dSmrg    if (_21 != 0)
110036ac495dSmrg      goto <bb 4> (instr9);
110136ac495dSmrg    else
110236ac495dSmrg      goto <bb 3> (instr4);
110336ac495dSmrg
110436ac495dSmrg  instr4:
110536ac495dSmrg  /* DUP */:
110636ac495dSmrg    _38 = arg_5 + -1;
110736ac495dSmrg    mult_acc_51 = mult_acc_1 * arg_5;
110836ac495dSmrg    goto <bb 2> (initial);
110936ac495dSmrg
111036ac495dSmrg    # stack$0_52 = PHI <arg_5(2)>
111136ac495dSmrg  instr9:
111236ac495dSmrg  /* RETURN */:
111336ac495dSmrg    stack ={v} {CLOBBER};
111436ac495dSmrg    mul_tmp_44 = mult_acc_1 * stack$0_52;
111536ac495dSmrg    return mul_tmp_44;
111636ac495dSmrg
111736ac495dSmrg  }
1118