inducer · a-alveyblanc · Nov 20, 2025 · Nov 20, 2025 · Nov 20, 2025 · Dec 10, 2025
diff --git a/examples/python/compute-examples/compute-tiled-matmul.py b/examples/python/compute-examples/compute-tiled-matmul.py
@@ -0,0 +1,166 @@
+import namedisl as nisl
+
+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
+from loopy.transform.compute import compute
+
+import numpy as np
+import numpy.linalg as la
+import pyopencl as cl
+
+
+def main(
+        M: int = 128,
+        N: int = 128,
+        K: int = 128,
+        bm: int = 32,
+        bn: int = 32,
+        bk: int = 16,
+        run_sequentially: bool = False,
+        use_precompute: bool = False,
+        use_compute: bool = False,
+        run_kernel: bool = False,
+        print_kernel: bool = False,
+        print_device_code: bool = False
+    ) -> None:
+
+    knl = lp.make_kernel(
+        "{ [i, j, k] : 0 <= i < M and 0 <= j < N and 0 <= k < K }",
+        """
+        a_(is, ks) := a[is, ks]
+        b_(ks, js) := b[ks, js]
+        c[i, j] = sum([k], a_(i, k) * b_(k, j))
+        """,
+        [
+            lp.GlobalArg("a", shape=(M, K), dtype=np.float64),
+            lp.GlobalArg("b", shape=(K, N), dtype=np.float64),
+            lp.GlobalArg("c", shape=(M, N), dtype=np.float64,
+                         is_output=True)
+        ]
+    )
+
+    # FIXME: without this, there are complaints about in-bounds access
+    # guarantees for the instruction that stores into c
+    knl = lp.fix_parameters(knl, M=M, N=N, K=K)
+
+    # shared memory tile-level splitting
+    knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io")
+    knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo")
+    knl = lp.split_iname(knl, "k", bk, inner_iname="ki", outer_iname="ko")
+
+    compute_map_a = nisl.make_map(f"""{{
+        [is, ks] -> [ii_s, io, ki_s, ko] :
+            is = io * {bm} + ii_s and
+            ks = ko * {bk} + ki_s
+    }}""")
+
+    compute_map_b = nisl.make_map(f"""{{
+        [ks, js] -> [ki_s, ko, ji_s, jo] :
+            js = jo * {bn} + ji_s and
+            ks = ko * {bk} + ki_s
+    }}""")
+
+    if use_compute:
+        knl = compute(
+            knl,
+            "a_",
+            compute_map=compute_map_a,
+            storage_indices=["ii_s", "ki_s"],
+            temporal_inames=["io", "ko", "jo"],
+            temporary_address_space=lp.AddressSpace.LOCAL,
+            temporary_dtype=np.float64
+        )
+
+        knl = compute(
+            knl,
+            "b_",
+            compute_map=compute_map_b,
+            storage_indices=["ki_s", "ji_s"],
+            temporal_inames=["io", "ko", "jo"],
+            temporary_address_space=lp.AddressSpace.LOCAL,
+            temporary_dtype=np.float64
+        )
+
+    if use_precompute:
+        knl = lp.precompute(
+            knl,
+            "a_",
+            sweep_inames=["ii", "ki"],
+        )
+
+    if not run_sequentially:
+        knl = lp.tag_inames(
+            knl, {
+                "io"   : "g.0", # outer block loop over block rows
+                "jo"   : "g.1", # outer block loop over block cols
+
+                "ii"   : "l.0", # inner block loop over rows
+                "ji"   : "l.1", # inner block loop over cols
+
+                "ii_s" : "l.0", # inner storage loop over a rows
+                "ji_s" : "l.0", # inner storage loop over b cols
+                "ki_s" : "l.1"  # inner storage loop over a cols / b rows
+            }
+        )
+
+    knl = lp.add_inames_for_unused_hw_axes(knl)
+
+    if run_kernel:
+        a = np.random.randn(M, K)
+        b = np.random.randn(K, N)
+
+        ctx = cl.create_some_context()
+        queue = cl.CommandQueue(ctx)
+
+        ex = knl.executor(ctx)
+        _, out = ex(queue, a=a, b=b)
+
+        print(20*"=", "Tiled matmul report", 20*"=")
+        print(f"Problem size:  M = {M:-4},  N = {N:-4},  K = {K:-4}")
+        print(f"Tile size   : BM = {bm:-4}, BN = {bn:-4}, BK = {bk:-4}")
+        print(f"Relative error = {la.norm((a @ b) - out) / la.norm(a @ b)}")
+        print((40 + len(" Tiled matmul report "))*"=")
+
+    if print_device_code:
+        print(lp.generate_code_v2(knl).device_code())
+
+    if print_kernel:
+        print(knl)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    _ = parser.add_argument("--precompute", action="store_true")
+    _ = parser.add_argument("--compute", action="store_true")
+    _ = parser.add_argument("--run-kernel", action="store_true")
+    _ = parser.add_argument("--print-kernel", action="store_true")
+    _ = parser.add_argument("--print-device-code", action="store_true")
+    _ = parser.add_argument("--run-sequentially", action="store_true")
+
+    _ = parser.add_argument("--m", action="store", type=int, default=128)
+    _ = parser.add_argument("--n", action="store", type=int, default=128)
+    _ = parser.add_argument("--k", action="store", type=int, default=128)
+
+    _ = parser.add_argument("--bm", action="store", type=int, default=32)
+    _ = parser.add_argument("--bn", action="store", type=int, default=32)
+    _ = parser.add_argument("--bk", action="store", type=int, default=16)
+
+    args = parser.parse_args()
+
+    main(
+        M=args.m,
+        N=args.n,
+        K=args.k,
+        bm=args.bm,
+        bn=args.bn,
+        bk=args.bk,
+        use_precompute=args.precompute,
+        use_compute=args.compute,
+        run_kernel=args.run_kernel,
+        print_kernel=args.print_kernel,
+        print_device_code=args.print_device_code,
+        run_sequentially=args.run_sequentially
+    )
diff --git a/examples/python/compute-examples/finite-difference-2-5D.py b/examples/python/compute-examples/finite-difference-2-5D.py
@@ -0,0 +1,131 @@
+import loopy as lp
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2
+from loopy.transform.compute import compute
+
+import namedisl as nisl
+
+import numpy as np
+import numpy.linalg as la
+
+import pyopencl as cl
+
+
+# FIXME: more complicated function, or better yet define a set of functions
+# with sympy and get the exact laplacian symbolically
+def f(x, y, z):
+    return x**2 + y**2 + z**2
+
+
+def laplacian_f(x, y, z):
+    return 6 * np.ones_like(x)
+
+
+def main(
+        use_compute: bool = False,
+        print_device_code: bool = False,
+        print_kernel: bool = False
+    ) -> None:
+    npts = 64
+    pts = np.linspace(-1, 1, num=npts, endpoint=True)
+    h = pts[1] - pts[0]
+
+    x, y, z = np.meshgrid(*(pts,)*3)
+
+    dtype = np.float32
+    x = x.reshape(*(npts,)*3).astype(np.float32)
+    y = y.reshape(*(npts,)*3).astype(np.float32)
+    z = z.reshape(*(npts,)*3).astype(np.float32)
+
+    f_ = f(x, y, z)
+    lap_fd = np.zeros_like(f_)
+    c = (np.array([-1/12, 4/3, -5/2, 4/3, -1/12]) / h**2).astype(dtype)
+
+    m = 5
+    r = m // 2
+
+    bm = bn = m
+
+    # FIXME: the usage on the k dimension is incorrect since we are only testing
+    # tiling (i, j) planes
+    knl = lp.make_kernel(
+        "{ [i, j, k, l] : r <= i, j, k < npts - r and -r <= l < r + 1 }",
+        """
+        u_(is, js, ks) := u[is, js, ks]
+
+        lap_u[i,j,k] = sum(
+            [l],
+            c[l+r] * (u_(i-l,j,k) + u_(i,j-l,k) + u[i,j,k-l])
+        )
+        """,
+        [
+            lp.GlobalArg("u", dtype=dtype, shape=(npts,npts,npts)),
+            lp.GlobalArg("lap_u", dtype=dtype, shape=(npts,npts,npts),
+                         is_output=True),
+            lp.GlobalArg("c", dtype=dtype, shape=(m))
+        ]
+    )
+
+    knl = lp.fix_parameters(knl, npts=npts, r=r)
+
+    knl = lp.split_iname(knl, "i", bm, inner_iname="ii", outer_iname="io")
+    knl = lp.split_iname(knl, "j", bn, inner_iname="ji", outer_iname="jo")
+
+    # FIXME: need to split k dimension
+
+    if use_compute:
+        compute_map = nisl.make_map(
+            f"""
+            {{
+                [is, js, ks] -> [io, ii_s, jo, ji_s, k] :
+                is = io * {bm} + ii_s - {r} and
+                js = jo * {bn} + ji_s - {r} and
+                ks = k
+            }}
+            """
+        )
+
+        knl = compute(
+            knl,
+            "u_",
+            compute_map=compute_map,
+            storage_indices=["ii_s", "ji_s"],
+            temporal_inames=["io", "jo", "k"],
+            temporary_name="u_compute",
+            temporary_address_space=lp.AddressSpace.LOCAL,
+            temporary_dtype=np.float32
+        )
+
+    if print_device_code:
+        print(lp.generate_code_v2(knl).device_code())
+
+    if print_kernel:
+        print(knl)
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    ex = knl.executor(queue)
+    _, lap_fd = ex(queue, u=f(x, y, z), c=c)
+
+    lap_true = laplacian_f(x, y, z)
+    sl = (slice(r, npts - r),)*3
+
+    print(la.norm(lap_true[sl] - lap_fd[0][sl]) / la.norm(lap_true[sl]))
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    _ = parser.add_argument("--compute", action="store_true")
+    _ = parser.add_argument("--print-device-code", action="store_true")
+    _ = parser.add_argument("--print-kernel", action="store_true")
+
+    args = parser.parse_args()
+
+    main(
+        use_compute=args.compute,
+        print_device_code=args.print_device_code,
+        print_kernel=args.print_kernel
+    )
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
@@ -48,7 +48,8 @@
 from constantdict import constantdict
 from typing_extensions import Self, override
 
+import namedisl as nisl
 import islpy as isl
 import pymbolic.primitives as p
 import pytools.lex
 from pymbolic.mapper import (
@@ -2044,45 +2045,60 @@
 
 # {{{ (pw)aff to expr conversion
 
-def aff_to_expr(aff: isl.Aff) -> ArithmeticExpression:
+def aff_to_expr(aff: isl.Aff | nisl.Aff) -> ArithmeticExpression:
     from pymbolic import var
 
+    # FIXME: remove this once namedisl is the standard in loopy
     denom = aff.get_denominator_val().to_python()
-
     result = (aff.get_constant_val()*denom).to_python()
-    for dt in [isl.dim_type.in_, isl.dim_type.param]:
-        for i in range(aff.dim(dt)):
-            coeff = (aff.get_coefficient_val(dt, i)*denom).to_python()
+    if isinstance(aff, isl.Aff):
+        for dt in [isl.dim_type.in_, isl.dim_type.param]:
+            for i in range(aff.dim(dt)):
+                coeff = (aff.get_coefficient_val(dt, i)*denom).to_python()
+                if coeff:
+                    dim_name = not_none(aff.get_dim_name(dt, i))
+                    result += coeff*var(dim_name)
+
+        for i in range(aff.dim(isl.dim_type.div)):
+            coeff = (aff.get_coefficient_val(isl.dim_type.div, i)*denom).to_python()
+            if coeff:
+                result += coeff*aff_to_expr(aff.get_div(i))
+
+    else:
+        in_names = set(aff.dim_type_names(isl.dim_type.in_))
+        param_names = set(aff.dim_type_names(isl.dim_type.param))
+
+        for name in in_names | param_names:
+            coeff = (aff.get_coefficient_val(name) * denom).to_python()
             if coeff:
-                dim_name = not_none(aff.get_dim_name(dt, i))
-                result += coeff*var(dim_name)
+                result = coeff * var(name)
 
-    for i in range(aff.dim(isl.dim_type.div)):
-        coeff = (aff.get_coefficient_val(isl.dim_type.div, i)*denom).to_python()
-        if coeff:
-            result += coeff*aff_to_expr(aff.get_div(i))
+        for name in aff.dim_type_names(isl.dim_type.div):
+            coeff = (aff.get_coefficient_val(name) * denom).to_python()
+            if coeff:
+                result += coeff * aff_to_expr(aff.get_div(name))
 
     assert not isinstance(result, complex)
     return flatten(result // denom)
 
 
 def pw_aff_to_expr(
-            pw_aff: int | isl.PwAff | isl.Aff,
+            pw_aff: int | isl.PwAff | isl.Aff | nisl.PwAff | nisl.Aff,
             int_ok: bool = False
         ) -> ArithmeticExpression:
     if isinstance(pw_aff, int):
        if not int_ok:
            warn(f"expected PwAff, got int: {pw_aff}", stacklevel=2)

        return pw_aff

    pieces = pw_aff.get_pieces()
    last_expr = aff_to_expr(pieces[-1][1])

    pairs = [(set_to_cond_expr(constr_set), aff_to_expr(aff))
             for constr_set, aff in pieces[:-1]]

    from pymbolic.primitives import If

    expr = last_expr
    for condition, then_expr in reversed(pairs):