From 63e83a70d18561cf140643ca757c0dbef98c9c46 Mon Sep 17 00:00:00 2001
From: AdrianLiu00 <adrianliu00@gmail.com>
Date: Wed, 25 Dec 2024 08:38:35 -0500
Subject: [PATCH] [dataflow] Supplement, refine, and organize designs of
 unified systolic array (#282)

---
 allo/dataflow.py                        |  17 +-
 tests/dataflow/test_daisy_chain_gemm.py |   2 -
 tests/dataflow/test_unified_systolic.py | 419 ++++++++++++++++++++++--
 3 files changed, 413 insertions(+), 25 deletions(-)

diff --git a/allo/dataflow.py b/allo/dataflow.py
index 9d3f2e58..646bfac1 100644
--- a/allo/dataflow.py
+++ b/allo/dataflow.py
@@ -167,7 +167,9 @@ def _build_top(s, stream_info):
     with s.module.context, Location.unknown():
         # create new func
         func_type = FunctionType.get(input_types, [])
-        new_top = func_d.FuncOp(name="top", type=func_type, ip=InsertionPoint(top_func))
+        new_top = func_d.FuncOp(
+            name=s.top_func_name, type=func_type, ip=InsertionPoint(top_func)
+        )
         new_top.add_entry_block()
         return_op = func_d.ReturnOp([], ip=InsertionPoint(new_top.entry_block))
         for op in top_func.entry_block.operations:
@@ -231,13 +233,19 @@ def wrapper(*args, **kwargs):
     return actual_decorator
 
 
-def customize(func):
+def df_primitive_default(s):
+    df_pipeline(s.module, rewind=True)
+
+
+def customize(func, opt_default=True):
     global_vars = get_global_vars(func)
     s = _customize(func, global_vars=global_vars)
     stream_info = move_stream_to_interface(s)
     s = _build_top(s, stream_info)
 
-    df_pipeline(s.module, rewind=True)
+    if opt_default:
+        df_primitive_default(s)
+
     return s
 
 
@@ -248,6 +256,7 @@ def build(
     project="top.prj",
     configs=None,
     wrap_io=True,
+    opt_default=True,
 ):
     if target == "aie":
         global_vars = get_global_vars(func)
@@ -257,7 +266,7 @@ def build(
         mod.build()
         return mod
     # FPGA backend
-    s = customize(func)
+    s = customize(func, opt_default)
     hls_mod = s.build(
         target=target,
         mode=mode,
diff --git a/tests/dataflow/test_daisy_chain_gemm.py b/tests/dataflow/test_daisy_chain_gemm.py
index 5b58487d..f7433713 100644
--- a/tests/dataflow/test_daisy_chain_gemm.py
+++ b/tests/dataflow/test_daisy_chain_gemm.py
@@ -86,8 +86,6 @@ def gemm(A: int16[M, K], B: int16[K, N], C: int16[M, N]):
                     fifo_A[i - 1, j].put(a)
                 with allo.meta_if(i < M):
                     fifo_B[i, j - 1].put(b)
-                with allo.meta_else():
-                    pass
 
             with allo.meta_if(i == 1):
                 packed_tmp: UInt(M * 16) = 0
diff --git a/tests/dataflow/test_unified_systolic.py b/tests/dataflow/test_unified_systolic.py
index d53607ff..7f327b4c 100644
--- a/tests/dataflow/test_unified_systolic.py
+++ b/tests/dataflow/test_unified_systolic.py
@@ -2,19 +2,289 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import allo
-from allo.ir.types import int32, bool
+from allo.ir.types import int16, int32, bool
 from allo.ir.utils import MockBuffer
 import allo.dataflow as df
 import allo.backend.hls as hls
 import numpy as np
 
-M, N, K = 4, 4, 4
-Rt, Ct = 2, 2
-P0, P1 = Rt + 2, Ct + 2
+
+@df.region()
+def unified_gemm_simple():
+    # interconnect
+    fifo_R = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0, P1 - 1))
+    fifo_C = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0 - 1, P1))
+    inst_broad = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P1 - 1,))
+    inst_chain = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P0 - 1, P1))
+
+    @df.kernel(mapping=[P0, P1])
+    def gemm(A: int32[M, K], B: int32[K, N], inst: bool, C: int32[M, N]):
+
+        i, j = df.get_pid()
+
+        # --------------------------------------------------------
+        # Decode and Dispatch
+
+        with allo.meta_if(i == 0 and j == 0):
+            tag: bool = inst
+            inst_broad[j].put(tag)
+            inst_chain[i, j].put(tag)
+
+        with allo.meta_else():
+            with allo.meta_if(i == 0):
+                flowtag: bool = inst_broad[j - 1].get()
+            with allo.meta_else():
+                flowtag: bool = inst_chain[i - 1, j].get()
+
+            with allo.meta_if(i == 0 and j != P1 - 1):
+                inst_broad[j].put(flowtag)
+            with allo.meta_if(i != P0 - 1):
+                inst_chain[i, j].put(flowtag)
+
+        # --------------------------------------------------------
+        # Computation
+
+        with allo.meta_if(i in {0, U + 1} and j in {0, U + 1}):
+            pass
+
+        with allo.meta_else():
+            # --------------------------------------------------------
+            # Parameters
+            Tlength: int32 = K if flowtag else M
+            Czero: int32 = 0
+
+            # peripheral Load
+            with allo.meta_if(i == 0):
+                for t in range(Tlength):
+                    if flowtag:
+                        fifo_C[i, j].put(B[t, j - 1])
+                    else:
+                        fifo_C[i, j].put(Czero)
+
+            with allo.meta_elif(j == 0):
+                for t in range(Tlength):
+                    fifo_R[i, j].put(A[i - 1, t] if flowtag else A[t, i - 1])
+
+            # peripheral Drain
+            with allo.meta_elif(i == U + 1 and j > 0):
+                for t in range(Tlength):
+                    if flowtag:
+                        c_drain: int32 = fifo_C[i - 1, j].get()
+                    else:
+                        C[t, j - 1] = fifo_C[i - 1, j].get()
+
+            with allo.meta_elif(j == U + 1 and i > 0):
+                for t in range(Tlength):
+                    r_drain: int32 = fifo_R[i, j - 1].get()
+
+            # main Compute
+            with allo.meta_else():
+                local_S: int32 = 0 if flowtag else B[i - 1, j - 1]
+
+                for t in range(Tlength):
+                    # Flow In
+                    s: int32 = local_S  # omit peripheral pe
+                    r: int32 = fifo_R[i, j - 1].get()
+                    c: int32 = fifo_C[i - 1, j].get()
+                    # Core MAC
+                    acti: int32 = r
+                    weight: int32 = c if flowtag else s
+                    psum: int32 = s if flowtag else c
+                    accu: int32 = acti * weight + psum
+                    # Flow Out
+                    local_S = accu if flowtag else s  # *
+                    fifo_R[i, j].put(r)
+                    fifo_C[i, j].put(c if flowtag else accu)
+
+                if flowtag:
+                    C[i - 1, j - 1] = local_S
 
 
 @df.region()
-def top():
+def unified_gemm_daisy_chain():
+    L2_R = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(P0 - 1,))
+    L2_C = df.array(df.pipe(dtype=UInt(N * 16), shape=(), depth=4), shape=(P1 - 1,))
+
+    L1_S = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(U + 1, N))
+    L2_S_in = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(N,))
+    L2_S_out = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(N,))
+
+    fifo_R = df.array(df.pipe(dtype=int16, shape=(), depth=4), shape=(U, N))
+    fifo_C = df.array(
+        df.pipe(dtype=int16, shape=(), depth=4), shape=(U + 1, N)
+    )  # Extra row carries the partial sums in weight-stationary (WS) mode
+
+    inst_broad = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P1 - 1,))
+    inst_chain = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P0 - 1, P1))
+
+    @df.kernel(mapping=[P0, P1])
+    def gemm(A: int16[M, K], B: int16[K, N], inst: bool, C: int16[M, N]):
+
+        # --------------------------------------------------------
+        # Parameters
+        i, j = df.get_pid()
+        Rtimes: int16 = U
+        Ctimes: int16 = N
+        Tlength: int16 = U
+        Czero: int16 = 0
+
+        # --------------------------------------------------------
+        # Instruction Decode and Dispatch
+
+        with allo.meta_if(i == 0 and j == 0):
+            flowtag: bool = inst
+            inst_broad[j].put(flowtag)
+            inst_chain[i, j].put(flowtag)
+
+        with allo.meta_else():
+            with allo.meta_if(i == 0):
+                flowtag: bool = inst_broad[j - 1].get()
+            with allo.meta_else():
+                flowtag: bool = inst_chain[i - 1, j].get()
+
+            with allo.meta_if(i == 0 and j != P1 - 1):
+                inst_broad[j].put(flowtag)
+            with allo.meta_if(i != P0 - 1):
+                inst_chain[i, j].put(flowtag)
+
+        # --------------------------------------------------------
+        # Computation
+
+        # corner kernels
+        with allo.meta_if(i == 0 and j == 0):
+            if not flowtag:
+                # pack weight
+                for n in range(N):
+                    packed_S_in: UInt(U * 16) = 0
+                    for k in range(U):
+                        packed_S_in[k * 16 : (k + 1) * 16] = B[k, n]
+                    L2_S_in[0].put(packed_S_in)
+
+            for u in range(U):
+                # pack data Row
+                packed_R: UInt(U * 16) = 0
+                if flowtag:
+                    for m in range(U):
+                        packed_R[m * 16 : (m + 1) * 16] = A[m, u]
+                else:
+                    for k in range(U):
+                        packed_R[k * 16 : (k + 1) * 16] = A[u, k]
+                L2_R[1].put(packed_R)
+                # pack data Column
+                packed_C: UInt(N * 16) = 0
+                if flowtag:
+                    for n in range(N):
+                        packed_C[n * 16 : (n + 1) * 16] = B[u, n]
+                else:
+                    for n in range(N):
+                        packed_C[n * 16 : (n + 1) * 16] = Czero
+                L2_C[1].put(packed_C)
+
+        with allo.meta_elif(i == P0 - 1 and j == P1 - 1):
+            for n in range(N):
+                packed_S_out = L2_S_out[N - 1].get()
+                for m in range(M):
+                    C[m, n] = packed_S_out[m * 16 : (m + 1) * 16]
+
+        with allo.meta_elif(i in {0, P0 - 1} and j in {0, P1 - 1}):
+            pass
+
+        # peripheral kernels
+        with allo.meta_elif(j == 0):
+            # i > 0, the first column
+            for u in range(U):
+                r = L2_R[i].get()
+                # unpack data
+                fifo_R[i - 1, 0].put(r[16 * (i - 1) : 16 * i])
+                with allo.meta_if(i < U):
+                    L2_R[i + 1].put(r)
+
+        with allo.meta_elif(i == 0):
+            # j > 0, the first row
+            if not flowtag:
+                L1_S[0, j - 1].put(L2_S_in[j - 1].get())
+                with allo.meta_if(j != P1 - 2):
+                    for ind in range(N - j):
+                        L2_S_in[j].put(L2_S_in[j - 1].get())
+
+            for u in range(U):
+                c = L2_C[j].get()
+                fifo_C[0, j - 1].put(c[16 * (j - 1) : 16 * j])
+                with allo.meta_if(j < N):
+                    L2_C[j + 1].put(c)
+
+        with allo.meta_elif(i == P0 - 1):
+            if flowtag:  # OS
+                c_C = L1_S[i - 1, N - j].get()
+                L2_S_out[j - 1].put(c_C)
+                with allo.meta_if(j != 1):
+                    for ind in range(j - 1):
+                        L2_S_out[j - 1].put(L2_S_out[j - 2].get())
+
+            else:  # WS
+                with allo.meta_if(j != 1):
+                    for ind in range(j - 1):
+                        L2_S_out[j - 1].put(L2_S_out[j - 2].get())
+
+                c_C: UInt(U * 16) = 0
+                for m in range(U):
+                    c_C[m * 16 : (m + 1) * 16] = fifo_C[U, j - 1].get()
+                L2_S_out[j - 1].put(c_C)
+
+        with allo.meta_elif(j == P1 - 1):
+            pass
+
+        # main body
+        with allo.meta_else():
+            local_s: int16 = 0
+
+            # Stationary Cache-In
+            if not flowtag:
+                packed_tmp: UInt(U * 16) = L1_S[i - 1, j - 1].get()
+                local_s = packed_tmp[16 * (i - 1) : 16 * i]
+                with allo.meta_if(i < U):
+                    L1_S[i, j - 1].put(packed_tmp)
+
+            for u in range(U):
+                # Flow In
+                r: int16 = fifo_R[i - 1, j - 1].get()
+                c: int16 = fifo_C[i - 1, j - 1].get()
+                # Core MAC
+                acti: int16 = r
+                weight: int16 = c if flowtag else local_s
+                psum: int16 = local_s if flowtag else c
+                accu: int16 = acti * weight + psum
+                if flowtag:
+                    local_s = accu
+                # Flow Out
+                with allo.meta_if(j < N):
+                    fifo_R[i - 1, j].put(r)
+                with allo.meta_if(i < U):
+                    fifo_C[i, j - 1].put(c if flowtag else accu)
+                with allo.meta_if(i == U):
+                    if not flowtag:
+                        fifo_C[i, j - 1].put(accu)
+
+            # Stationary Cache-Out
+            if flowtag:
+                with allo.meta_if(i == 1):
+                    packed_tmp: UInt(U * 16) = 0
+                with allo.meta_else():
+                    packed_tmp: UInt(U * 16) = L1_S[i - 1, j - 1].get()
+
+                packed_c: UInt(U * 16) = 0
+                for m in range(U):
+                    if m == i - 1:
+                        packed_c[m * 16 : (m + 1) * 16] = local_s
+                    else:
+                        packed_c[m * 16 : (m + 1) * 16] = packed_tmp[
+                            m * 16 : (m + 1) * 16
+                        ]
+                L1_S[i, j - 1].put(packed_c)
+
+
+@df.region()
+def unified_gemm_tiling():
     # interconnect
     fifo_R = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0, P1 - 1))
     fifo_C = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0 - 1, P1))
@@ -99,7 +369,7 @@ def gemm(A: int32[M, K], B: int32[K, N], inst: bool, C: int32[M, N]):
                         )
 
                         for t in range(Tlength):
-                            # Flow IN
+                            # Flow In
                             s: int32 = local_S  # omit peripheral pe
                             r: int32 = fifo_R[i, j - 1].get()
                             c: int32 = fifo_C[i - 1, j].get()
@@ -108,15 +378,13 @@ def gemm(A: int32[M, K], B: int32[K, N], inst: bool, C: int32[M, N]):
                             weight: int32 = c if flowtag else s
                             psum: int32 = s if flowtag else c
                             accu: int32 = acti * weight + psum
-                            # FLOW OUT
+                            # Flow Out
                             local_S = accu if flowtag else s  # *
                             fifo_R[i, j].put(r)
                             fifo_C[i, j].put(c if flowtag else accu)
 
                         if flowtag:
                             C[ri * Rt + (i - 1), ci * Ct + (j - 1)] = local_S
-                        else:
-                            pass
 
 
 def schedule_unified_systolic(s):
@@ -126,7 +394,12 @@ def schedule_unified_systolic(s):
     return s
 
 
-def test_unified_systolic():
+U = 4  # Without tiling, both array dimensions must be the same size
+M, N, K = U, 4, U
+P0, P1 = U + 2, U + 2
+
+
+def test_unified_simple():
 
     A = np.random.randint(-8, 8, (M, K)).astype(np.int32)
     B = np.random.randint(-8, 8, (K, N)).astype(np.int32)
@@ -135,12 +408,12 @@ def test_unified_systolic():
 
     if hls.is_available("vitis_hls"):
 
-        s = df.customize(top)
+        s = df.customize(unified_gemm_simple)
         schedule_unified_systolic(s)
 
         # csim test
         print(" Csim Test ".center(60, "*"))
-        mod = s.build(target="vitis_hls", mode="csim", project="top.prj")
+        mod = s.build(target="vitis_hls", mode="csim", project="df-uni-simple-csim.prj")
         C_truth = np.dot(A, B)
         print(C_truth)
 
@@ -157,16 +430,115 @@ def test_unified_systolic():
         np.testing.assert_allclose(C, C_truth, atol=1e-5)
         print("Csim: Output-stationary Mode Passed!")
 
-        # csyn test
-        print(" Csyn Test ".center(60, "*"))
-        mod_csyn = s.build(target="vitis_hls", mode="csyn", project="df-uni-csyn.prj")
-        mod_csyn()
-        print("Design: C-Synthesizable!")
+        # hw_emu test
+        print(" Hw_emu Test ".center(60, "*"))
+        mod_hwemu = s.build(
+            target="vitis_hls", mode="hw_emu", project="df-uni-simple-hwemu.prj"
+        )
+        C = np.zeros((M, N), dtype=np.int32)
+        mod_hwemu(A, B, flowtag1, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Hw_emu: Weight-stationary Mode Passed!")
+
+        C = np.zeros((M, N), dtype=np.int32)
+        mod_hwemu(A, B, flowtag2, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Hw_emu: Output-stationary Mode Passed!")
+
+
+def test_unified_daisy_chain():
+    A = np.random.randint(0, 8, (M, K), dtype=np.int16)
+    B = np.random.randint(0, 8, (K, N), dtype=np.int16)
+    C = np.zeros((M, N), dtype=np.int16)
+
+    if hls.is_available("vitis_hls"):
+        # csim test
+        print(" Csim Test ".center(60, "*"))
+        mod = df.build(
+            unified_gemm_daisy_chain,
+            target="vitis_hls",
+            mode="csim",
+            project="df-uni-daisy-csim.prj",
+        )
+        C_truth = np.dot(A, B)
+        print(C_truth)
+
+        flowtag1: bool = False
+        mod(A, B, flowtag1, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Csim: Weight-stationary Mode Passed!")
+
+        flowtag2: bool = True
+        C = np.zeros((M, N), dtype=np.int16)
+        mod(A, B, flowtag2, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Csim: Output-stationary Mode Passed!")
+
+        # hw_emu test
+        print(" Hw_emu Test ".center(60, "*"))
+        mod_hwemu = df.build(
+            unified_gemm_daisy_chain,
+            target="vitis_hls",
+            mode="hw_emu",
+            project="df-uni-daisy-hwemu.prj",
+        )
+        C = np.zeros((M, N), dtype=np.int32)
+        mod_hwemu(A, B, flowtag1, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Hw_emu: Weight-stationary Mode Passed!")
+
+        C = np.zeros((M, N), dtype=np.int32)
+        mod_hwemu(A, B, flowtag2, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Hw_emu: Output-stationary Mode Passed!")
+
+
+M, N, K = 4, 4, 4
+Rt, Ct = 2, 2
+P0, P1 = Rt + 2, Ct + 2
+
+
+def test_unified_tiling():
+
+    A = np.random.randint(-8, 8, (M, K)).astype(np.int32)
+    B = np.random.randint(-8, 8, (K, N)).astype(np.int32)
+
+    C = np.zeros((M, N), dtype=np.int32)
+
+    if hls.is_available("vitis_hls"):
+
+        s = df.customize(unified_gemm_tiling)
+        schedule_unified_systolic(s)
+
+        # csim test
+        print(" Csim Test ".center(60, "*"))
+        mod = s.build(target="vitis_hls", mode="csim", project="df-uni-tiling-csim.prj")
+        C_truth = np.dot(A, B)
+        print(C_truth)
+
+        flowtag1: bool = False
+        mod(A, B, flowtag1, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Csim: Weight-stationary Mode Passed!")
+
+        flowtag2: bool = True
+        C = np.zeros((M, N), dtype=np.int32)
+        mod(A, B, flowtag2, C)
+        print(C)
+        np.testing.assert_allclose(C, C_truth, atol=1e-5)
+        print("Csim: Output-stationary Mode Passed!")
 
         # hw_emu test
         print(" Hw_emu Test ".center(60, "*"))
         mod_hwemu = s.build(
-            target="vitis_hls", mode="hw_emu", project="df-uni-hwemu.prj"
+            target="vitis_hls", mode="hw_emu", project="df-uni-tiling-hwemu.prj"
         )
         C = np.zeros((M, N), dtype=np.int32)
         mod_hwemu(A, B, flowtag1, C)
@@ -182,4 +554,13 @@ def test_unified_systolic():
 
 
 if __name__ == "__main__":
-    test_unified_systolic()
+    U = 4  # Without tiling, both array dimensions must be the same size
+    M, N, K = U, 4, U
+    P0, P1 = U + 2, U + 2
+    test_unified_simple()
+    test_unified_daisy_chain()
+
+    M, N, K = 4, 4, 4
+    Rt, Ct = 2, 2
+    P0, P1 = Rt + 2, Ct + 2
+    test_unified_tiling()