Copy Fails in FORTRAN Order but Works for C Order #1953

philip-paul-mueller · 2025-02-26T08:47:59Z

Consider the following situation, where t1 is copied into t2, the Memlet expression is t1[0:10, 0:10] -> [0:10, 0:10].

It is important that they do not have the same shape, t1's shape is (20, 20) while the shape of t2 is (10, 10).
Now we consider the case where t1 does not have padding while t2 might have padding (I have not tried the case where t1 has padding but I doubt that it would be much different, because we only copy a subset out of t2 so it is consecutive anyway, so the rest of the array essentially acts as some kind of implicit padding).

Now we consider the case following cases:

Both arrays have C order and t2 has padding, so the strides are t1.strides := (20, 1) and t2.strides := (20, 1). In this case it works fine, note that neither the reading from t1 nor the writing in t2 are fully consecutive, for that reason DaCe generates calls to cudaMemcpy2DToArray().
Both arrays have FORTRAN order and t2 has padding (this is also the case the reproducer handles), thus the strides are t1.strides := (1, 20) and t2.strides := (1, 20). In that case code generation fails, it runs into a NotImplementedError in dace/codegen/targets/cuda.py:996, which is quite strange because there is nothing fundamentally different from the case above.
Both arrays have FORTRAN order, but t2 does not have padding, so the strides are t1.strides := (1, 20) and t2.strides := (1, 10). This time code generation works and the code runs as expected. However, the code generation handles it by running CopyToMap on that Memlet. The question is why it does or have not done it in the previous variant.

After a discussion with Lex I think that the checks the (CUDA) code generator does on the Memlets and calls CopyToMap are incomplete.

import dace
import numpy as np
import cupy as cp
import os


def ref_impl(
        a: dace.float64[20, 20],
        b: dace.float64[10, 10],
):
    """Reference of the computation.

    Note this is a CPU version, however, the real code uses GPU.
    """
    t1 = np.zeros((20, 20), dtype=np.float64)
    t2 = np.zeros((10, 10), dtype=np.float64)

    for i in range(20):
        for j in range(20):
            t1[i, j] = a[i, j] + 1.0

    t2[0:10, 0:10] = t1[0:10, 0:10]

    for i in range(10):
        for j in range(10):
            b[i, j] = t1[i, j] + 2.0


def _make_sdfg(
        c_order: bool,
) -> dace.SDFG:
    """Generates the SDFG.

    If `c_order` is `True` then the transients will have C order otherwise
    they have FORTRAN order. Note that the input arguments always have C order.
    """
    sdfg = dace.SDFG("test" + ("_corder" if c_order else "_forder"))
    state = sdfg.add_state(is_start_block=True)
    names = ["a", "t1", "t2", "b"]

    # Generate the data  descriptor (we will modify the transients later)
    for name in names:
        sdfg.add_array(
                name=name,
                shape=((20, 20) if name in {"a", "t1"} else (10, 10)),
                total_size=400,
                dtype=dace.float64,
                storage=dace.StorageType.GPU_Global,
                transient=name.startswith("t"),
                strides=None,
        )

    if c_order:
        sdfg.arrays["t1"].strides = (20, 1)
        # This case passes even if we change the stride to `(10, 1)`, what is interesting thou
        #  is that in that case the code generator does not run `CopyToMap` on it. Thus it is
        #  able to handle it natively!
        sdfg.arrays["t2"].strides = (20, 1)
    else:
        sdfg.arrays["t1"].strides = (1, 20)
        # NOTE:
        #  If we would set the stride of `t2` to `(1, 10)`, then it would work, this is because the
        #  code generator would turn it into a map (because it runs `CopyToMap`). This is interesting
        #  because in the C order it does not do that.
        sdfg.arrays["t2"].strides = (1, 20)

    t1, t2 = (state.add_access(name) for name in ["t1", "t2"])

    _, me, _ = state.add_mapped_tasklet(
            "comp1",
            schedule=dace.ScheduleType.GPU_Device,
            map_ranges={
                "__i0": "0:20",
                "__i1": "0:20",
            },
            inputs={"__in1": dace.Memlet("a[__i0, __i1]")},
            code="__out = __in1 + 1.0",
            outputs={"__out": dace.Memlet("t1[__i0, __i1]")},
            output_nodes={t1},
            external_edges=True,
    )
    me.map.gpu_block_size = (32, 1, 1)  # To avoid some annoying warning.

    state.add_nedge(
            t1,
            t2,
            dace.Memlet("t1[0:10, 0:10] -> [0:10, 0:10]"),
    )

    _, me, _ = state.add_mapped_tasklet(
            "comp2",
            schedule=dace.ScheduleType.GPU_Device,
            map_ranges={
                "__i0": "0:10",
                "__i1": "0:10",
            },
            inputs={"__in1": dace.Memlet("t2[__i0, __i1]")},
            code="__out = __in1 + 2.0",
            outputs={"__out": dace.Memlet("b[__i0, __i1]")},
            input_nodes={t2},
            external_edges=True,
    )
    me.map.gpu_block_size = (32, 1, 1)  # To avoid some annoying warning.

    sdfg.validate()
    return sdfg


def _do_test(
        c_order: bool,
) -> None:
    sdfg = _make_sdfg(c_order)

    try:
        csdfg = sdfg.compile()
    except Exception as e:
        raise RuntimeError(f"CODE GENERATION FAILED")

    ref = {
            "a": cp.array(np.random.rand(20, 20), order="C", dtype=cp.float64),
            "b": cp.array(np.random.rand(10, 10), order="C", dtype=cp.float64),
    }
    res = {k: v.copy() for k, v in ref.items()}

    csdfg(**res)
    ref_impl(**ref)

    assert all(cp.allclose(ref[k], res[k]) for k in res.keys())


def test_c_order():
    _do_test(True)


def test_f_order():
    _do_test(False)


if __name__ == "__main__":
    print(f"Try C order")
    try:
        test_c_order()
    except Exception as e:
        raise RuntimeError(f"The C-Order case failed unexpectingly with error:\n{str(e)}")
    else:
        print(f"As expected C order has passed\n\n")

    print("Now trying FORTRAN order.")
    try:
        test_f_order()
    except Exception as e:
        if(isinstance(e, RuntimeError) and str(e) == "CODE GENERATION FAILED"):
            print("As expected the code generation for FORTRAN order failed.")
            sys.exit(1)
        else:
            raise RuntimeError(f"Code generation for FORTRAN order seems to have passed, but an other error has occurred:\n{str(e)}")
    else:
        raise RuntimeError("The FORTRAN order case passed, did you fix the error?")

The text was updated successfully, but these errors were encountered:

philip-paul-mueller added the codegen label Feb 26, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Copy Fails in FORTRAN Order but Works for C Order #1953

Copy Fails in FORTRAN Order but Works for C Order #1953

philip-paul-mueller commented Feb 26, 2025 •

edited

Loading

Copy Fails in FORTRAN Order but Works for C Order #1953

Copy Fails in FORTRAN Order but Works for C Order #1953

Comments

philip-paul-mueller commented Feb 26, 2025 • edited Loading

philip-paul-mueller commented Feb 26, 2025 •

edited

Loading