From 63e83a70d18561cf140643ca757c0dbef98c9c46 Mon Sep 17 00:00:00 2001 From: AdrianLiu00 Date: Wed, 25 Dec 2024 08:38:35 -0500 Subject: [PATCH] [dataflow] Supplement, refine, and organize designs of unified systolic array (#282) --- allo/dataflow.py | 17 +- tests/dataflow/test_daisy_chain_gemm.py | 2 - tests/dataflow/test_unified_systolic.py | 419 ++++++++++++++++++++++-- 3 files changed, 413 insertions(+), 25 deletions(-) diff --git a/allo/dataflow.py b/allo/dataflow.py index 9d3f2e58..646bfac1 100644 --- a/allo/dataflow.py +++ b/allo/dataflow.py @@ -167,7 +167,9 @@ def _build_top(s, stream_info): with s.module.context, Location.unknown(): # create new func func_type = FunctionType.get(input_types, []) - new_top = func_d.FuncOp(name="top", type=func_type, ip=InsertionPoint(top_func)) + new_top = func_d.FuncOp( + name=s.top_func_name, type=func_type, ip=InsertionPoint(top_func) + ) new_top.add_entry_block() return_op = func_d.ReturnOp([], ip=InsertionPoint(new_top.entry_block)) for op in top_func.entry_block.operations: @@ -231,13 +233,19 @@ def wrapper(*args, **kwargs): return actual_decorator -def customize(func): +def df_primitive_default(s): + df_pipeline(s.module, rewind=True) + + +def customize(func, opt_default=True): global_vars = get_global_vars(func) s = _customize(func, global_vars=global_vars) stream_info = move_stream_to_interface(s) s = _build_top(s, stream_info) - df_pipeline(s.module, rewind=True) + if opt_default: + df_primitive_default(s) + return s @@ -248,6 +256,7 @@ def build( project="top.prj", configs=None, wrap_io=True, + opt_default=True, ): if target == "aie": global_vars = get_global_vars(func) @@ -257,7 +266,7 @@ def build( mod.build() return mod # FPGA backend - s = customize(func) + s = customize(func, opt_default) hls_mod = s.build( target=target, mode=mode, diff --git a/tests/dataflow/test_daisy_chain_gemm.py b/tests/dataflow/test_daisy_chain_gemm.py index 5b58487d..f7433713 100644 --- a/tests/dataflow/test_daisy_chain_gemm.py +++ b/tests/dataflow/test_daisy_chain_gemm.py @@ -86,8 +86,6 @@ def gemm(A: int16[M, K], B: int16[K, N], C: int16[M, N]): fifo_A[i - 1, j].put(a) with allo.meta_if(i < M): fifo_B[i, j - 1].put(b) - with allo.meta_else(): - pass with allo.meta_if(i == 1): packed_tmp: UInt(M * 16) = 0 diff --git a/tests/dataflow/test_unified_systolic.py b/tests/dataflow/test_unified_systolic.py index d53607ff..7f327b4c 100644 --- a/tests/dataflow/test_unified_systolic.py +++ b/tests/dataflow/test_unified_systolic.py @@ -2,19 +2,289 @@ # SPDX-License-Identifier: Apache-2.0 import allo -from allo.ir.types import int32, bool +from allo.ir.types import int16, int32, bool from allo.ir.utils import MockBuffer import allo.dataflow as df import allo.backend.hls as hls import numpy as np -M, N, K = 4, 4, 4 -Rt, Ct = 2, 2 -P0, P1 = Rt + 2, Ct + 2 + +@df.region() +def unified_gemm_simple(): + # interconnect + fifo_R = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0, P1 - 1)) + fifo_C = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0 - 1, P1)) + inst_broad = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P1 - 1,)) + inst_chain = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P0 - 1, P1)) + + @df.kernel(mapping=[P0, P1]) + def gemm(A: int32[M, K], B: int32[K, N], inst: bool, C: int32[M, N]): + + i, j = df.get_pid() + + # -------------------------------------------------------- + # Decode and Dispatch + + with allo.meta_if(i == 0 and j == 0): + tag: bool = inst + inst_broad[j].put(tag) + inst_chain[i, j].put(tag) + + with allo.meta_else(): + with allo.meta_if(i == 0): + flowtag: bool = inst_broad[j - 1].get() + with allo.meta_else(): + flowtag: bool = inst_chain[i - 1, j].get() + + with allo.meta_if(i == 0 and j != P1 - 1): + inst_broad[j].put(flowtag) + with allo.meta_if(i != P0 - 1): + inst_chain[i, j].put(flowtag) + + # -------------------------------------------------------- + # Computation + + with allo.meta_if(i in {0, U + 1} and j in {0, U + 1}): + pass + + with allo.meta_else(): + # -------------------------------------------------------- + # Parameters + Tlength: int32 = K if flowtag else M + Czero: int32 = 0 + + # peripheral Load + with allo.meta_if(i == 0): + for t in range(Tlength): + if flowtag: + fifo_C[i, j].put(B[t, j - 1]) + else: + fifo_C[i, j].put(Czero) + + with allo.meta_elif(j == 0): + for t in range(Tlength): + fifo_R[i, j].put(A[i - 1, t] if flowtag else A[t, i - 1]) + + # peripheral Drain + with allo.meta_elif(i == U + 1 and j > 0): + for t in range(Tlength): + if flowtag: + c_drain: int32 = fifo_C[i - 1, j].get() + else: + C[t, j - 1] = fifo_C[i - 1, j].get() + + with allo.meta_elif(j == U + 1 and i > 0): + for t in range(Tlength): + r_drain: int32 = fifo_R[i, j - 1].get() + + # main Compute + with allo.meta_else(): + local_S: int32 = 0 if flowtag else B[i - 1, j - 1] + + for t in range(Tlength): + # Flow In + s: int32 = local_S # omit peripheral pe + r: int32 = fifo_R[i, j - 1].get() + c: int32 = fifo_C[i - 1, j].get() + # Core MAC + acti: int32 = r + weight: int32 = c if flowtag else s + psum: int32 = s if flowtag else c + accu: int32 = acti * weight + psum + # Flow Out + local_S = accu if flowtag else s # * + fifo_R[i, j].put(r) + fifo_C[i, j].put(c if flowtag else accu) + + if flowtag: + C[i - 1, j - 1] = local_S @df.region() -def top(): +def unified_gemm_daisy_chain(): + L2_R = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(P0 - 1,)) + L2_C = df.array(df.pipe(dtype=UInt(N * 16), shape=(), depth=4), shape=(P1 - 1,)) + + L1_S = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(U + 1, N)) + L2_S_in = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(N,)) + L2_S_out = df.array(df.pipe(dtype=UInt(U * 16), shape=(), depth=4), shape=(N,)) + + fifo_R = df.array(df.pipe(dtype=int16, shape=(), depth=4), shape=(U, N)) + fifo_C = df.array( + df.pipe(dtype=int16, shape=(), depth=4), shape=(U + 1, N) + ) # Additional one for partial sum in WS + + inst_broad = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P1 - 1,)) + inst_chain = df.array(df.pipe(dtype=bool, shape=(), depth=4), shape=(P0 - 1, P1)) + + @df.kernel(mapping=[P0, P1]) + def gemm(A: int16[M, K], B: int16[K, N], inst: bool, C: int16[M, N]): + + # -------------------------------------------------------- + # Parameters + i, j = df.get_pid() + Rtimes: int16 = U + Ctimes: int16 = N + Tlength: int16 = U + Czero: int16 = 0 + + # -------------------------------------------------------- + # Instruction Decode and Dispatch + + with allo.meta_if(i == 0 and j == 0): + flowtag: bool = inst + inst_broad[j].put(flowtag) + inst_chain[i, j].put(flowtag) + + with allo.meta_else(): + with allo.meta_if(i == 0): + flowtag: bool = inst_broad[j - 1].get() + with allo.meta_else(): + flowtag: bool = inst_chain[i - 1, j].get() + + with allo.meta_if(i == 0 and j != P1 - 1): + inst_broad[j].put(flowtag) + with allo.meta_if(i != P0 - 1): + inst_chain[i, j].put(flowtag) + + # -------------------------------------------------------- + # Computation + + # corner kernels + with allo.meta_if(i == 0 and j == 0): + if not flowtag: + # pack weight + for n in range(N): + packed_S_in: UInt(U * 16) = 0 + for k in range(U): + packed_S_in[k * 16 : (k + 1) * 16] = B[k, n] + L2_S_in[0].put(packed_S_in) + + for u in range(U): + # pack data Row + packed_R: UInt(U * 16) = 0 + if flowtag: + for m in range(U): + packed_R[m * 16 : (m + 1) * 16] = A[m, u] + else: + for k in range(U): + packed_R[k * 16 : (k + 1) * 16] = A[u, k] + L2_R[1].put(packed_R) + # pack data Column + packed_C: UInt(N * 16) = 0 + if flowtag: + for n in range(N): + packed_C[n * 16 : (n + 1) * 16] = B[u, n] + else: + for n in range(N): + packed_C[n * 16 : (n + 1) * 16] = Czero + L2_C[1].put(packed_C) + + with allo.meta_elif(i == P0 - 1 and j == P1 - 1): + for n in range(N): + packed_S_out = L2_S_out[N - 1].get() + for m in range(M): + C[m, n] = packed_S_out[m * 16 : (m + 1) * 16] + + with allo.meta_elif(i in {0, P0 - 1} and j in {0, P1 - 1}): + pass + + # peripheral kernels + with allo.meta_elif(j == 0): + # i > 0, the first column + for u in range(U): + r = L2_R[i].get() + # unpack data + fifo_R[i - 1, 0].put(r[16 * (i - 1) : 16 * i]) + with allo.meta_if(i < U): + L2_R[i + 1].put(r) + + with allo.meta_elif(i == 0): + # j > 0, the first row + if not flowtag: + L1_S[0, j - 1].put(L2_S_in[j - 1].get()) + with allo.meta_if(j != P1 - 2): + for ind in range(N - j): + L2_S_in[j].put(L2_S_in[j - 1].get()) + + for u in range(U): + c = L2_C[j].get() + fifo_C[0, j - 1].put(c[16 * (j - 1) : 16 * j]) + with allo.meta_if(j < N): + L2_C[j + 1].put(c) + + with allo.meta_elif(i == P0 - 1): + if flowtag: # OS + c_C = L1_S[i - 1, N - j].get() + L2_S_out[j - 1].put(c_C) + with allo.meta_if(j != 1): + for ind in range(j - 1): + L2_S_out[j - 1].put(L2_S_out[j - 2].get()) + + else: # WS + with allo.meta_if(j != 1): + for ind in range(j - 1): + L2_S_out[j - 1].put(L2_S_out[j - 2].get()) + + c_C: UInt(U * 16) = 0 + for m in range(U): + c_C[m * 16 : (m + 1) * 16] = fifo_C[U, j - 1].get() + L2_S_out[j - 1].put(c_C) + + with allo.meta_elif(j == P1 - 1): + pass + + # main body + with allo.meta_else(): + local_s: int16 = 0 + + # Stationary Cache-In + if not flowtag: + packed_tmp: UInt(U * 16) = L1_S[i - 1, j - 1].get() + local_s = packed_tmp[16 * (i - 1) : 16 * i] + with allo.meta_if(i < U): + L1_S[i, j - 1].put(packed_tmp) + + for u in range(U): + # Flow In + r: int16 = fifo_R[i - 1, j - 1].get() + c: int16 = fifo_C[i - 1, j - 1].get() + # Core MAC + acti: int16 = r + weight: int16 = c if flowtag else local_s + psum: int16 = local_s if flowtag else c + accu: int16 = acti * weight + psum + if flowtag: + local_s = accu + # Flow Out + with allo.meta_if(j < N): + fifo_R[i - 1, j].put(r) + with allo.meta_if(i < U): + fifo_C[i, j - 1].put(c if flowtag else accu) + with allo.meta_if(i == U): + if not flowtag: + fifo_C[i, j - 1].put(accu) + + # Stationary Cache-Out + if flowtag: + with allo.meta_if(i == 1): + packed_tmp: UInt(U * 16) = 0 + with allo.meta_else(): + packed_tmp: UInt(U * 16) = L1_S[i - 1, j - 1].get() + + packed_c: UInt(U * 16) = 0 + for m in range(U): + if m == i - 1: + packed_c[m * 16 : (m + 1) * 16] = local_s + else: + packed_c[m * 16 : (m + 1) * 16] = packed_tmp[ + m * 16 : (m + 1) * 16 + ] + L1_S[i, j - 1].put(packed_c) + + +@df.region() +def unified_gemm_tiling(): # interconnect fifo_R = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0, P1 - 1)) fifo_C = df.array(df.pipe(dtype=int32, shape=(), depth=16), shape=(P0 - 1, P1)) @@ -99,7 +369,7 @@ def gemm(A: int32[M, K], B: int32[K, N], inst: bool, C: int32[M, N]): ) for t in range(Tlength): - # Flow IN + # Flow In s: int32 = local_S # omit peripheral pe r: int32 = fifo_R[i, j - 1].get() c: int32 = fifo_C[i - 1, j].get() @@ -108,15 +378,13 @@ def gemm(A: int32[M, K], B: int32[K, N], inst: bool, C: int32[M, N]): weight: int32 = c if flowtag else s psum: int32 = s if flowtag else c accu: int32 = acti * weight + psum - # FLOW OUT + # Flow Out local_S = accu if flowtag else s # * fifo_R[i, j].put(r) fifo_C[i, j].put(c if flowtag else accu) if flowtag: C[ri * Rt + (i - 1), ci * Ct + (j - 1)] = local_S - else: - pass def schedule_unified_systolic(s): @@ -126,7 +394,12 @@ def schedule_unified_systolic(s): return s -def test_unified_systolic(): +U = 4 # Require for same size in two dimension if not tiling +M, N, K = U, 4, U +P0, P1 = U + 2, U + 2 + + +def test_unified_simple(): A = np.random.randint(-8, 8, (M, K)).astype(np.int32) B = np.random.randint(-8, 8, (K, N)).astype(np.int32) @@ -135,12 +408,12 @@ def test_unified_systolic(): if hls.is_available("vitis_hls"): - s = df.customize(top) + s = df.customize(unified_gemm_simple) schedule_unified_systolic(s) # csim test print(" Csim Test ".center(60, "*")) - mod = s.build(target="vitis_hls", mode="csim", project="top.prj") + mod = s.build(target="vitis_hls", mode="csim", project="df-uni-simple-csim.prj") C_truth = np.dot(A, B) print(C_truth) @@ -157,16 +430,115 @@ def test_unified_systolic(): np.testing.assert_allclose(C, C_truth, atol=1e-5) print("Csim: Output-stationary Mode Passed!") - # csyn test - print(" Csyn Test ".center(60, "*")) - mod_csyn = s.build(target="vitis_hls", mode="csyn", project="df-uni-csyn.prj") - mod_csyn() - print("Design: C-Synthesizable!") + # hw_emu test + print(" Hw_emu Test ".center(60, "*")) + mod_hwemu = s.build( + target="vitis_hls", mode="hw_emu", project="df-uni-simple-hwemu.prj" + ) + C = np.zeros((M, N), dtype=np.int32) + mod_hwemu(A, B, flowtag1, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Hw_emu: Weight-stationary Mode Passed!") + + C = np.zeros((M, N), dtype=np.int32) + mod_hwemu(A, B, flowtag2, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Hw_emu: Output-stationary Mode Passed!") + + +def test_unified_daisy_chain(): + A = np.random.randint(0, 8, (M, K), dtype=np.int16) + B = np.random.randint(0, 8, (K, N), dtype=np.int16) + C = np.zeros((M, N), dtype=np.int16) + + if hls.is_available("vitis_hls"): + # csim test + print(" Csim Test ".center(60, "*")) + mod = df.build( + unified_gemm_daisy_chain, + target="vitis_hls", + mode="csim", + project="df-uni-daisy-csim.prj", + ) + C_truth = np.dot(A, B) + print(C_truth) + + flowtag1: bool = False + mod(A, B, flowtag1, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Csim: Weight-stationary Mode Passed!") + + flowtag2: bool = True + C = np.zeros((M, N), dtype=np.int16) + mod(A, B, flowtag2, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Csim: Output-stationary Mode Passed!") + + # hw_emu test + print(" Hw_emu Test ".center(60, "*")) + mod_hwemu = df.build( + unified_gemm_daisy_chain, + target="vitis_hls", + mode="hw_emu", + project="df-uni-daisy-hwemu.prj", + ) + C = np.zeros((M, N), dtype=np.int32) + mod_hwemu(A, B, flowtag1, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Hw_emu: Weight-stationary Mode Passed!") + + C = np.zeros((M, N), dtype=np.int32) + mod_hwemu(A, B, flowtag2, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Hw_emu: Output-stationary Mode Passed!") + + +M, N, K = 4, 4, 4 +Rt, Ct = 2, 2 +P0, P1 = Rt + 2, Ct + 2 + + +def test_unified_tiling(): + + A = np.random.randint(-8, 8, (M, K)).astype(np.int32) + B = np.random.randint(-8, 8, (K, N)).astype(np.int32) + + C = np.zeros((M, N), dtype=np.int32) + + if hls.is_available("vitis_hls"): + + s = df.customize(unified_gemm_tiling) + schedule_unified_systolic(s) + + # csim test + print(" Csim Test ".center(60, "*")) + mod = s.build(target="vitis_hls", mode="csim", project="df-uni-tiling-csim.prj") + C_truth = np.dot(A, B) + print(C_truth) + + flowtag1: bool = False + mod(A, B, flowtag1, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Csim: Weight-stationary Mode Passed!") + + flowtag2: bool = True + C = np.zeros((M, N), dtype=np.int32) + mod(A, B, flowtag2, C) + print(C) + np.testing.assert_allclose(C, C_truth, atol=1e-5) + print("Csim: Output-stationary Mode Passed!") # hw_emu test print(" Hw_emu Test ".center(60, "*")) mod_hwemu = s.build( - target="vitis_hls", mode="hw_emu", project="df-uni-hwemu.prj" + target="vitis_hls", mode="hw_emu", project="df-uni-tiling-hwemu.prj" ) C = np.zeros((M, N), dtype=np.int32) mod_hwemu(A, B, flowtag1, C) @@ -182,4 +554,13 @@ def test_unified_systolic(): if __name__ == "__main__": - test_unified_systolic() + U = 4 # Require for same size in two dimension if not tiling + M, N, K = U, 4, U + P0, P1 = U + 2, U + 2 + test_unified_simple() + test_unified_daisy_chain() + + M, N, K = 4, 4, 4 + Rt, Ct = 2, 2 + P0, P1 = Rt + 2, Ct + 2 + test_unified_tiling()