diff --git a/Bender.yml b/Bender.yml index f0497c7..45d0671 100644 --- a/Bender.yml +++ b/Bender.yml @@ -32,6 +32,7 @@ sources: # Individual source files are simple string entries: - src/ita_package.sv - src/ita_accumulator.sv + - src/ita_masking.sv - src/ita_controller.sv - src/ita_dotp.sv - src/ita_fifo_controller.sv diff --git a/PyITA/ITA.py b/PyITA/ITA.py index c366685..e69de29 100644 --- a/PyITA/ITA.py +++ b/PyITA/ITA.py @@ -1,1444 +0,0 @@ -# Copyright 2023 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -# ---------------------------------------------------------------------- -# -# File: ITA.py -# -# Last edited: 5.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich -# -# ---------------------------------------------------------------------- - -import os -import sys -from functools import partial -from typing import Union - -import numpy as np -from numpy.typing import ArrayLike, DTypeLike - -import seaborn as sns -import matplotlib.pyplot as plt - -from .softmax import fastSoftmax, realSoftmax, streamingPartialSoftmax -from .gelu import gelu_requantize, i_gelu_requantized, get_i_gelu_constants, get_i_gelu_requantized_constants -from .util import (generate_matrix_mem, pack_8b_to_word, pack_array_8b_to_word, pack_hex_24b, pack_multihead_8b_to_word, - pack_multihead_24b_to_word, random_shuffled_tensor, requantize, split_matrix, to_hex, write_matrix, - write_matrix_mem, write_matrix_mem_hex, write_vector_mem_hex, get_almost_symmetric_scaling_factor, - error_MAEP) - - -class Transformer: - WO = 26 - WI = 8 - - def __init__(self, - S: int, - P: int, - E: int, - F: int, - H: int, - path: Union[str, os.PathLike], - bias: bool = True, - activation: str = "identity", - mask: str = "none", - Q: ArrayLike = None, - K: ArrayLike = None, - V: ArrayLike = None, - Wq: ArrayLike = None, - 
Wk: ArrayLike = None, - Wv: ArrayLike = None, - Wo: ArrayLike = None, - Bq: ArrayLike = None, - Bk: ArrayLike = None, - Bv: ArrayLike = None, - Bo: ArrayLike = None, - FF_in: ArrayLike = None, - Wff: ArrayLike = None, - Wff2: ArrayLike = None, - Bff: ArrayLike = None, - Bff2: ArrayLike = None): - - self.ITA_N = 16 - self.ITA_M = 64 - - # WIESEP: Set numpy print options - np.set_printoptions(threshold = sys.maxsize) - np.set_printoptions(linewidth = np.inf) - - self._init_paths(path) - - self.S_ITA = ((S - 1) // self.ITA_M + 1) * self.ITA_M - self.P_ITA = ((P - 1) // self.ITA_M + 1) * self.ITA_M - self.E_ITA = ((E - 1) // self.ITA_M + 1) * self.ITA_M - self.F_ITA = ((F - 1) // self.ITA_M + 1) * self.ITA_M - self.H_ITA = 4 - self.split = self.ITA_M // self.ITA_N - - self.S = S - self.P = P - self.E = E - self.F = F - self.H = H - self.bias = bias - self.activation = activation - self.mask = mask - - # Setup transformation functions - self.split_m_m = partial(split_matrix, block_shape = (self.ITA_M, self.ITA_M)) - self.split_m_n = partial(split_matrix, block_shape = (self.ITA_M, self.ITA_N)) - - self._validate_matrix_constraints(K, V) - self._initialize_quantization_parameters() - self._init_gelu_constants() - self._initialize_tensors(Q, V, Wq, Wk, Wv, Wo, Bq, Bk, Bv, Bo, FF_in, Wff, Wff2, Bff, Bff2) - - def split_multihead_m_m(self, multihead_array: np.ndarray): - """ - Split a multihead array into blocks of size ITA_M x ITA_M. - - Args: - multihead_array (np.ndarray): A 3-dimensional numpy array to be split into blocks. - - Returns: - np.ndarray: A 3-dimensional numpy array with the blocks of size ITA_M x ITA_M, where all blocks are stacked vertically in the inner dimensions. 
- """ - return [self.split_m_m(array) for array in multihead_array] - - def _validate_matrix_constraints(self, K: ArrayLike, V: ArrayLike): - # WIESEP: Ensure that K is the same as V because we do cross-attention - assert (np.all(K == V)) - - # WIESEP: Current restrictions for ITA - # assert (self.S % self.ITA_M == 0), "Sequence length must be divisible by ITA_M" - # assert (self.P % self.ITA_M == 0), "Projection space must be divisible by ITA_M" - # assert (self.E % self.ITA_M == 0), "Embedding size must be divisible by ITA_M" - # assert (self.F % self.ITA_M == 0), "Feedforward size must be divisible by ITA_M" - - assert ( - self.E <= 512 - ), f"Embedding size must be less than {int(2**(self.WO-17))} because the internal bit width is {self.WO} bits" - assert ( - self.P <= 512 - ), f"Projection space must be less than {int(2**(self.WO-17))} because the internal bit width is {self.WO} bits" - assert ( - self.S <= 512 - ), f"Sequence length must be less than {int(2**(self.WO-17))} because the internal bit width is {self.WO} bits" - assert ( - self.F <= 512 - ), f"Feedforward size must be less than {int(2**(self.WO-17))} because the internal bit width is {self.WO} bits" - - # assert (self.H % self.H_ITA == 0 or self.H == 1), "Number of heads must be one or divisible by H_ITA" - - def _initialize_tensors(self, Q, V, Wq, Wk, Wv, Wo, Bq, Bk, Bv, Bo, FF_in, Wff, Wff2, Bff, Bff2): - - self.exp_sum = np.zeros(self.S, dtype = np.int32) - - self.Q_in = random_shuffled_tensor((self.S, self.E), self.WI) if Q is None else Q - self.Q = np.pad(self.Q_in, ((0, self.S_ITA - self.S), (0, self.E_ITA - self.E))) - - self.V_in = random_shuffled_tensor((self.S, self.E), self.WI) if V is None else V - self.V = np.pad(self.V_in, ((0, self.S_ITA - self.S), (0, self.E_ITA - self.E))) - - # WIESEP: K is the same as V because we do cross-attention - self.K_in = self.V_in - self.K = self.V - - self.FF_in = random_shuffled_tensor((self.S, self.E), self.WI) if FF_in is None else FF_in - self.FF = 
np.pad(self.FF_in, ((0, self.S_ITA - self.S), (0, self.E_ITA - self.E))) - - #### Weight matrices #### - self.Wq_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI) if Wq is None else Wq - self.Wq = np.pad(self.Wq_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.P_ITA - self.P))) - - self.Wk_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI) if Wk is None else Wk - self.Wk = np.pad(self.Wk_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.P_ITA - self.P))) - - self.Wv_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI) if Wv is None else Wv - self.Wv = np.pad(self.Wv_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.P_ITA - self.P))) - - self.Wo_in = random_shuffled_tensor((self.H, self.P, self.E), self.WI) if Wo is None else Wo - self.Wo = np.pad(self.Wo_in, ((0, 0), (0, self.P_ITA - self.P), (0, self.E_ITA - self.E))) - - self.Wff_in = random_shuffled_tensor((1, self.E, self.F), self.WI) if Wff is None else Wff - self.Wff = np.pad(self.Wff_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.F_ITA - self.F))) - self.Wff2_in = random_shuffled_tensor((1, self.F, self.E), self.WI) if Wff2 is None else Wff2 - self.Wff2 = np.pad(self.Wff2_in, ((0, 0), (0, self.F_ITA - self.F), (0, self.E_ITA - self.E))) - - #### Bias matrices #### - if self.bias: - self.Bq_in = random_shuffled_tensor( - (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bq is None else Bq - else: - self.Bq_in = np.zeros((self.H, self.P), dtype = np.int8) - self.Bq = np.pad(self.Bq_in, ((0, 0), (0, self.P_ITA - self.P))) - self.Bq_broadcast = np.reshape(np.repeat(self.Bq, self.S, axis = 0), (self.H, self.S, self.P_ITA)) - self.Bq_broadcast = np.pad(self.Bq_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0))) - - - if self.bias: - self.Bk_in = random_shuffled_tensor( - (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bk is None else Bk - else: - self.Bk_in = np.zeros((self.H, self.P), dtype = np.int8) - self.Bk = np.pad(self.Bk_in, ((0, 0), (0, 
self.P_ITA - self.P))) - self.Bk_broadcast = np.reshape(np.repeat(self.Bk, self.S, axis = 0), (self.H, self.S, self.P_ITA)) - self.Bk_broadcast = np.pad(self.Bk_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0))) - - if self.bias: - self.Bv_in = random_shuffled_tensor( - (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bv is None else Bv - else: - self.Bv_in = np.zeros((self.H, self.P), dtype = np.int8) - self.Bv = np.pad(self.Bv_in, ((0, 0), (0, self.P_ITA - self.P))) - self.Bv_broadcast = np.reshape(np.repeat(self.Bv, self.S, axis = 0), (self.H, self.S, self.P_ITA)) - self.Bv_broadcast = np.pad(self.Bv_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0))) - - if self.bias: - self.Bo_in = random_shuffled_tensor( - (self.H, self.E), int(np.log2(self.E)) + 8, type = np.int32) if Bo is None else Bo - else: - self.Bo_in = np.zeros((self.H, self.E), dtype = np.int8) - self.Bo = np.pad(self.Bo_in, ((0, 0), (0, self.E_ITA - self.E))) - self.Bo_broadcast = np.reshape(np.repeat(self.Bo, self.S, axis = 0), (self.H, self.S, self.E_ITA)) - self.Bo_broadcast = np.pad(self.Bo_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0))) - - if self.bias: - self.Bff_in = random_shuffled_tensor( - (1, self.F), int(np.log2(self.F)) + 8, type = np.int32) if Bff is None else Bff - else: - self.Bff_in = np.zeros((1, self.F), dtype = np.int8) - self.Bff = np.pad(self.Bff_in, ((0, 0), (0, self.F_ITA - self.F))) - self.Bff_broadcast = np.reshape(np.repeat(self.Bff, self.S, axis = 0), (1, self.S, self.F_ITA)) - self.Bff_broadcast = np.pad(self.Bff_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0))) - if self.bias: - self.Bff2_in = random_shuffled_tensor( - (1, self.E), int(np.log2(self.E)) + 8, type = np.int32) if Bff2 is None else Bff2 - else: - self.Bff2_in = np.zeros((1, self.E), dtype = np.int8) - self.Bff2 = np.pad(self.Bff2_in, ((0, 0), (0, self.E_ITA - self.E))) - self.Bff2_broadcast = np.reshape(np.repeat(self.Bff2, self.S, axis = 0), (1, self.S, self.E_ITA)) - 
self.Bff2_broadcast = np.pad(self.Bff2_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0))) - - #### Intermediate tensors #### - - self.Qp = None - self.Qp_requant = None - self.Kp = None - self.Kp_requant = None - self.Vp = None - self.Vp_requant = None - self.FFp = None - self.FFp_requant = None - self.FF2p = None - self.FF2p_requant = None - - self.A = None - self.A_requant = None - self.A_real_softmax = np.zeros([self.H, self.S, self.S], dtype = np.int8) - self.A_partial_softmax = np.zeros([self.H, self.S, self.S], dtype = np.int8) - - self.Mask = None - - self.O_soft = None - self.O_soft_requant = None - - self.Out_soft = None - self.Out_soft_requant = None - - self.Out_soft_sum = None - self.Out_soft_sum_requant = None - - self.preactivation = np.random.randint(-128, 127, size = (self.S, self.F), dtype = np.int8) - self.postactivation = None - - def _initialize_quantization_parameters(self): - # WIESEP: 6 steps for attention layer and one to requantize the accumulated output, 2 for feedforward - self.requant_eps_mult = np.zeros((7, self.H), dtype = np.uint8) - self.requant_right_shift = np.zeros((7, self.H), dtype = np.uint8) - - # WIESEP: Add parameter in transformers will always be zero as there are no batch normalization layers - self.requant_add = np.zeros((7, self.H), dtype = np.int8) - - for i in range(7): - self.requant_eps_mult[i, :] = np.random.randint(64, 127, size = (1, self.H), dtype = np.uint8) - - if i < 3: # Q, K, V - max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.E * 2**9).astype(np.uint32) - elif i == 3: # QK - max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.P * 2**8).astype(np.uint32) - elif i == 4: # AV - max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.S * 2**5).astype(np.uint32) - elif i == 5: # OW - max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.E * 2**9).astype(np.uint32) - elif i == 6: # Sum OW - max_bit_width = 
np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.H * 2**7).astype(np.uint32) - - # WIESEP: Last requatization after head summation shares the same parameters - if i == 6: - self.requant_right_shift[i, :] = np.tile(max_bit_width[0] - 8 + 2, self.H) - else: - self.requant_right_shift[i, :] = max_bit_width - 8 + 2 - - write_matrix([self.requant_eps_mult.T], "RQS_ATTN_MUL", self.paths["base"]) - write_matrix([self.requant_right_shift.T], "RQS_ATTN_SHIFT", self.paths["base"]) - write_matrix([self.requant_add.T], "RQS_ATTN_ADD", self.paths["base"]) - - self.requant_eps_mult_ffn = np.zeros((2, 1), dtype = np.uint8) - self.requant_right_shift_ffn = np.zeros((2, 1), dtype = np.uint8) - self.requant_add_ffn = np.zeros((2, 1), dtype = np.int8) - - for i in range(2): - self.requant_eps_mult_ffn[i, :] = np.random.randint(64, 127, size = (1, 1), dtype = np.uint8) - - if i == 0: - max_bit_width = np.log2(self.requant_eps_mult_ffn[i, :].astype(np.uint32) * self.E * 2**9).astype( - np.uint32) - elif i == 1: - max_bit_width = np.log2(self.requant_eps_mult_ffn[i, :].astype(np.uint32) * self.F * 2**9).astype( - np.uint32) - - self.requant_right_shift_ffn[i, :] = max_bit_width - 8 + 2 - - write_matrix([self.requant_eps_mult_ffn.T], "RQS_FFN_MUL", self.paths["base"]) - write_matrix([self.requant_right_shift_ffn.T], "RQS_FFN_SHIFT", self.paths["base"]) - write_matrix([self.requant_add_ffn.T], "RQS_FFN_ADD", self.paths["base"]) - - def _init_gelu_constants(self): - CLIP_LO = -4 - D = 2**20 - - gelu_eps_mult, _ = get_almost_symmetric_scaling_factor(CLIP_LO, n_bits = 8) - self.q_1, self.q_b, self.q_c, _, _, _, self.gelu_rqs_mul, self.gelu_rqs_shift, self.gelu_rqs_add, S_out = get_i_gelu_requantized_constants( - gelu_eps_mult, D) - - write_matrix([[self.q_1]], "GELU_ONE", self.paths["base"]) - write_matrix([[self.q_b]], "GELU_B", self.paths["base"]) - write_matrix([[self.q_c]], "GELU_C", self.paths["base"]) - write_matrix([[self.gelu_rqs_mul]], "activation_requant_mult", 
self.paths["base"]) - write_matrix([[self.gelu_rqs_shift]], "activation_requant_shift", self.paths["base"]) - write_matrix([[self.gelu_rqs_add]], "activation_requant_add", self.paths["base"]) - - def _init_paths(self, base_path: Union[str, os.PathLike]): - self.paths = { - "base": base_path, - "mempool": os.path.join(base_path, "mempool/"), - "hwpe": os.path.join(base_path, "hwpe/"), - "standalone": os.path.join(base_path, "standalone/"), - "snitch-cluster": os.path.join(base_path, "snitch-cluster/") - } - for path in self.paths.values(): - os.makedirs(path, exist_ok = True) - - def print_properties(self, verbose: int, text_align = 30): - if verbose > 0: - print(f"{'ITA Sequence Length ' :<{text_align}}: {self.S_ITA}") - print(f"{'ITA Projection Space' :<{text_align}}: {self.P_ITA}") - print(f"{'ITA Embedding Size ' :<{text_align}}: {self.E_ITA}") - print(f"{'ITA Number of Heads ' :<{text_align}}: {self.H_ITA}") - print(f"{'Matrix Sequence Length ' :<{text_align}}: {self.S}") - print(f"{'Matrix Projection Space' :<{text_align}}: {self.P}") - print(f"{'Matrix Embedding Size ' :<{text_align}}: {self.E}") - print(f"{'Matrix Feedforward Size' :<{text_align}}: {self.F}") - print(f"{'Matrix Number of Heads ' :<{text_align}}: {self.H}") - print(f"{'Bias ' :<{text_align}}: {bool(self.bias)}") - print(f"{'Requant Mult Attention ' :<{text_align}}: {list(self.requant_eps_mult)}") - print(f"{'Requant Shift Attention ' :<{text_align}}: {list(self.requant_right_shift)}") - print(f"{'Requant Add Attention ' :<{text_align}}: {list(self.requant_add)}") - print(f"{'Requant Mult FFN ' :<{text_align}}: {list(self.requant_eps_mult_ffn)}") - print(f"{'Requant Shift FFN ' :<{text_align}}: {list(self.requant_right_shift_ffn)}") - print(f"{'Requant Add FFN ' :<{text_align}}: {list(self.requant_add_ffn)}") - - def tiler_QK(self, qk: np.ndarray, weight: np.ndarray, bias: np.ndarray, output: np.ndarray, input_file: str, - weight_file: str, bias_file: str, output_file: str): - """ - Tile 
input, weight, bias and output for Q and K generation - """ - - # Weight Wqk is H x E x P - # Transpose Wqk to H x P x E - # print(f"qk: {qk.shape}") - # print(f"qk: {weight.shape}") - - weight = np.transpose(weight, (0, 2, 1)) - - tile_x = qk.shape[0] // self.ITA_M # S // ITA_M - tile_inner = qk.shape[1] // self.ITA_M # E // ITA_M - tile_y = weight.shape[1] // self.ITA_M # P // ITA_M - print(f"=> Tile: {input_file} x {weight_file} + {bias_file} = {output_file}") - print(f" X: {tile_x}, Y: {tile_y}, Inner: {tile_inner}") - - # Input QK is S x E - Input = split_matrix(qk, (self.ITA_M, self.ITA_M), flatten = False) - # Repeat each row of each tile split times - Input = np.tile(Input, [1, 1, self.split, 1]) - # Repeat each tile number of output row tiles times - Input = np.tile(Input, [1, tile_y, 1, 1]).reshape((-1, self.ITA_M)) - # fig, ax = plt.subplots(1, 2) # Create a figure with two subplots - # im0 = ax[0].imshow(Input, cmap='viridis') - # im1 = ax[1].imshow(np.squeeze(weight, axis=0)) - - # # Add colorbars for each image if needed - # fig.colorbar(im0, ax=ax[0]) - # fig.colorbar(im1, ax=ax[1]) - - # # Set titles for each subplot - # ax[0].set_title("Inputs") - # ax[1].set_title("Weights") - - plt.show() - write_matrix(Input, input_file, self.paths["standalone"]) - - # Transposed Weight Wqk is H x P x E - for h in range(self.H): - Weight = split_matrix(weight[h], (self.ITA_M, self.ITA_M)) - # Repeat each tile number of output column tiles times - Weight = np.tile(Weight, [tile_x, 1]) - write_matrix(Weight, f"{weight_file}_{h}", self.paths["standalone"]) - - # Bias Bqk is H x P - # Broadcast Bias Bqk to H x S x P - bias = np.tile(bias, [1, self.S_ITA, 1]) - for h in range(self.H): - Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N)) - write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"]) - - # Output QKp is H x S x P - for h in range(self.H): - Output = split_matrix(output[h], (self.ITA_M, self.ITA_N)) - write_matrix(Output, f"{output_file}_{h}", 
self.paths["standalone"]) - - def tiler_V(self, v, weight, bias, output, input_file, weight_file, bias_file, output_file): - """ - Tile input, weight, bias and output for V generation - *Compute Vp in transposed form* - """ - - # Weight Wv is H x E x P - # Transpose Wv to H x P x E - weight = np.transpose(weight, (0, 2, 1)) - - tile_x = v.shape[0] // self.ITA_M # S // ITA_M - tile_inner = v.shape[1] // self.ITA_M # E // ITA_M - tile_y = weight.shape[1] // self.ITA_M # P // ITA_M - print(f"=> Tile: {input_file} x {weight_file} + {bias_file} = {output_file}") - print(f" X: {tile_x}, Y: {tile_y}, Inner: {tile_inner}") - - # Input V is S x E (will be used as second input) - Input = split_matrix(v, (self.ITA_M, self.ITA_M)) - # Repeat each tile number of output row tiles times - Input = np.tile(Input, [tile_y, 1]) - write_matrix(Input, input_file, self.paths["standalone"]) - - # Transposed Weight Wv is H x P x E (will be used as first input) - for h in range(self.H): - Weight = split_matrix(weight[h], (self.ITA_M, self.ITA_M), flatten = False) - # Repeat each row of each tile split times - Weight = np.tile(Weight, [1, 1, self.split, 1]) - # Repeat each tile number of output column tiles times - Weight = np.tile(Weight, [1, tile_x, 1, 1]).reshape((-1, self.ITA_M)) - write_matrix(Weight, f"{weight_file}_{h}", self.paths["standalone"]) - - # Bias Bv is H x P - # Broadcast Bias Bv to H x S x P - bias = np.tile(bias, [1, self.S_ITA, 1]) - # Transpose Bias Bv to H x P x S - bias = np.transpose(bias, (0, 2, 1)) - for h in range(self.H): - Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N)) - write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"]) - - # Output Vp is H x S x P - # Transpose Vp to H x P x S - output = np.transpose(output, (0, 2, 1)) - for h in range(self.H): - Output = split_matrix(output[h], (self.ITA_M, self.ITA_N)) - write_matrix(Output, f"{output_file}_{h}", self.paths["standalone"]) - - def tiler_AV(self, Qp, Kp, output, input_file, weight_file, 
output_file): - """ - Tile input, weight, and output for Q.K = A and A.V = O generation - """ - - tile_x = Qp.shape[1] // self.ITA_M - tile_inner = Qp.shape[2] // self.ITA_M - tile_y = Kp.shape[1] // self.ITA_M - print(f"=> Tile: {input_file} x {weight_file} = {output_file}") - print(f" X: {tile_x}, Y: {tile_y}, Inner: {tile_inner}") - - # Input Qp is H x S x P or A is S x S - for h in range(self.H): - Input = split_matrix(Qp[h], (self.ITA_M, self.ITA_M), flatten = False) - # Repeat each row of each tile split times - Input = np.tile(Input, [1, 1, self.split, 1]) - # Repeat each tile number of output row tiles times - Input = np.tile(Input, [1, tile_y, 1, 1]).reshape((-1, self.ITA_M)) - write_matrix(Input, f"{input_file}_{h}", self.paths["standalone"]) - - # Weight Kp is H x S x P or V is H x P x S - for h in range(self.H): - Weight = split_matrix(Kp[h], (self.ITA_M, self.ITA_M)) - # Repeat each tile number of output column tiles times - Weight = np.tile(Weight, [tile_x, 1]) - write_matrix(Weight, f"{weight_file}_{h}", self.paths["standalone"]) - - # Output A is H x S x S or O is H x S x P - for h in range(self.H): - Output = split_matrix(output[h], (self.ITA_M, self.ITA_N)) - write_matrix(Output, f"{output_file}_{h}", self.paths["standalone"]) - - def tiler_Out(self, O, weight, bias, output, input_file, weight_file, bias_file, output_file): - """ - Tile input, weight, bias and output for Output generation - Same as QK but takes multi-head input - """ - - # Weight Wo is H x P x E - # Transpose Wo to H x E x P - weight = np.transpose(weight, (0, 2, 1)) - - tile_x = O.shape[1] // self.ITA_M # S // ITA_M - tile_inner = O.shape[2] // self.ITA_M # P // ITA_M - tile_y = weight.shape[1] // self.ITA_M # E // ITA_M - - print(f"=> Tile: {input_file} x {weight_file} + {bias_file} = {output_file}") - print(f" X: {tile_x}, Y: {tile_y}, Inner: {tile_inner}") - - # Input O is H x S x P - for h in range(self.H): - Input = split_matrix(O[h], (self.ITA_M, self.ITA_M), flatten = 
False) - # Repeat each row of each tile split times - Input = np.tile(Input, [1, 1, self.split, 1]) - # Repeat each tile number of output row tiles times - Input = np.tile(Input, [1, tile_y, 1, 1]).reshape((-1, self.ITA_M)) - write_matrix(Input, f"{input_file}_{h}", self.paths["standalone"]) - - # Transposed Weight Wo is H x E x P - for h in range(self.H): - Weight = split_matrix(weight[h], (self.ITA_M, self.ITA_M)) - # Repeat each tile number of output column tiles times - Weight = np.tile(Weight, [tile_x, 1]) - write_matrix(Weight, f"{weight_file}_{h}", self.paths["standalone"]) - - # Bias Bo is H x E - # Broadcast Bias Bo to H x S x E - bias = np.tile(bias, [1, self.S_ITA, 1]) - for h in range(self.H): - Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N)) - write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"]) - - # Output is H x S x E - for h in range(self.H): - Output = split_matrix(output[h], (self.ITA_M, self.ITA_N)) - write_matrix(Output, f"{output_file}_{h}", self.paths["standalone"]) - - def step1_Qp(self): - self.Qp = np.matmul(self.Q, self.Wq, dtype = np.int32) + self.Bq_broadcast - self.Qp = np.clip(self.Qp, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.Qp_requant = requantize(self.Qp, self.requant_eps_mult[0], self.requant_right_shift[0], - self.requant_add[0]) - - # Set padded values to zero - if (self.S_ITA - self.S) > 0: - self.Qp_requant[:, -(self.S_ITA - self.S):, :] = 0 - if (self.P_ITA - self.P) > 0: - self.Qp_requant[:, :, -(self.P_ITA - self.P):] = 0 - - self.tiler_QK(self.Q, self.Wq, self.Bq, self.Qp_requant, "Q", "Wq", "Bq", "Qp") - - def step2_Kp(self): - self.Kp = np.matmul(self.K, self.Wk, dtype = np.int32) + self.Bk_broadcast - self.Kp = np.clip(self.Kp, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.Kp_requant = requantize(self.Kp, self.requant_eps_mult[1], self.requant_right_shift[1], - self.requant_add[1]) - - if (self.S_ITA - self.S) > 0: - self.Kp_requant[:, -(self.S_ITA - self.S):, :] = 0 - if (self.P_ITA - 
self.P) > 0: - self.Kp_requant[:, :, -(self.P_ITA - self.P):] = 0 - - self.tiler_QK(self.K, self.Wk, self.Bk, self.Kp_requant, "K", "Wk", "Bk", "Kp") - - def step3_Vp(self): - self.Vp = np.matmul(self.V, self.Wv, dtype = np.int32) + self.Bv_broadcast - self.Vp = np.clip(self.Vp, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.Vp_requant = requantize(self.Vp, self.requant_eps_mult[2], self.requant_right_shift[2], - self.requant_add[2]) - - if (self.S_ITA - self.S) > 0: - self.Vp_requant[:, -(self.S_ITA - self.S):, :] = 0 - if (self.P_ITA - self.P) > 0: - self.Vp_requant[:, :, -(self.P_ITA - self.P):] = 0 - - # Compute Vp in transposed form - self.tiler_V(self.V, self.Wv, self.Bv, self.Vp_requant, "V", "Wv", "Bv", "Vp") - - def apply_mask(self, index): - - if (self.mask == 'upper_triangular'): - self.Mask = np.full((self.H, self.S, self.S), fill_value=False, dtype='bool') - if (0 < index and index < self.S): - for h in range(self.Mask.shape[0]): - for i in range(self.Mask.shape[1]): - for j in range((i + index), self.Mask.shape[2]): - self.Mask[h][i][j] = True - else: - raise ValueError(f"Index is out of bounds for {self.mask} mask") - elif (self.mask == 'lower_triangular'): - self.Mask = np.full((self.H, self.S, self.S), fill_value=False, dtype='bool') - if (0 < index and index < self.S): - for h in range(self.Mask.shape[0]): - for i in range(index, self.Mask.shape[1]): - for j in range((i-(index-1))): - self.Mask[h][i][j] = True - else: - raise ValueError(f"Index is out of bounds for {self.mask} mask") - elif (self.mask == 'strided'): - self.Mask = np.full((self.H, self.S, self.S), fill_value=True, dtype='bool') - if (0 < index and index < self.S): - for h in range(self.Mask.shape[0]): - for i in range(self.Mask.shape[1]): - self.Mask[h][i][i] = False - for j in range(i, self.Mask.shape[2], index): - self.Mask[h][i][j] = False - self.Mask[h][j][i] = False - else: - raise ValueError(f"Index is out of bounds for {self.mask} mask") - elif (self.mask == 
'upper_strided'): - pass - elif (self.mask == 'lower_strided'): - pass - elif (self.mask == 'lower_local'): - pass - elif(self.mask == 'none'): - pass - else: - raise ValueError("Mask not supported") - - - def step4_QK(self, no_partial_softmax, index): - self.A = np.array( - [np.matmul(self.Qp_requant[i], np.transpose(self.Kp_requant[i]), dtype = np.int32) for i in range(self.H)]) - self.A = np.clip(self.A, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.A_requant = requantize(self.A, self.requant_eps_mult[3], self.requant_right_shift[3], self.requant_add[3]) - - self.apply_mask(index) - - print(self.Mask) - - matrix = np.squeeze(self.A_requant) - plt.imshow(matrix, cmap='viridis') - plt.colorbar() - plt.title("A_requant/A_stream_soft_in") - plt.show() - - print(f"A_requant row 0: {self.A_requant[0, 0, :]}") - - if (self.S_ITA - self.S) > 0: - self.A_requant[:, -(self.S_ITA - self.S):, :] = 0 - self.A_requant[:, :, -(self.S_ITA - self.S):] = 0 - - self.soft(no_partial_softmax) - - matrix = np.squeeze(self.A_partial_softmax) - plt.imshow(matrix, cmap='viridis') - plt.colorbar() - plt.title("A_partial_softmax") - plt.show() - - self.tiler_AV(self.Qp_requant, self.Kp_requant, self.A_requant, "Qp_in", "Kp_in", "A") - - def soft(self, no_partial_softmax = False): - self.A_real_softmax = realSoftmax(self.A_requant[:, :self.S, :self.S]) - self.A_real_softmax = np.pad(self.A_real_softmax, ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S))) - - if no_partial_softmax: - self.A_partial_softmax = fastSoftmax(self.A_requant[:, :self.S, :self.S]) - self.A_partial_softmax = np.pad(self.A_partial_softmax, - ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S))) - else: - self.A_partial_softmax = streamingPartialSoftmax(self.A_requant[:, :self.S, :self.S], self.Mask) - self.A_partial_softmax[self.Mask] = 0 - print(f"inp_stream_soft_o: {self.A_partial_softmax[0,:,:]}") - print(f"Normalization Sum: {np.sum(self.A_partial_softmax[0,:,:], axis=1)}") - 
self.A_partial_softmax = np.pad(self.A_partial_softmax, - ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S))) - - if self.H == 1: - A_save = [np.tile(self.A_partial_softmax[i], [self.split, 1]) for i in range(self.H)] - write_matrix(A_save, "A_soft_in", self.paths["standalone"]) - for h in range(self.H): - A_save = self.A_partial_softmax[h] - write_matrix(A_save, f"A_soft_{h}", self.paths["standalone"]) - - def step5_AV(self): - print(f"A_partial_softmax: {self.A_partial_softmax.shape}") - print(f"Vp_requant: {self.Vp_requant.shape}") - - self.O_soft = np.array([ - np.matmul(self.A_partial_softmax[i].astype(np.uint8), self.Vp_requant[i], dtype = np.int32) - for i in range(self.H) - ]) - print(f"O_soft without requant row 0: {self.O_soft[0, 62, :]}") - print(f"O_soft without requant row 0: {self.O_soft[0, 63, :]}") - print(f"O_soft without requant row 0: {self.O_soft[0, 0, :]}") - print(f"O_soft without requant row 0: {self.O_soft[0, 1, :]}") - - self.O_soft = np.clip(self.O_soft, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.O_soft_requant = requantize(self.O_soft, self.requant_eps_mult[4], self.requant_right_shift[4], - self.requant_add[4]) - - print(f"O_soft_requant: {self.O_soft_requant[0, 62, :]}") - print(f"O_soft_requant: {self.O_soft_requant[0, 63, :]}") - print(f"O_soft_requant: {self.O_soft_requant[0, 0, :]}") - print(f"O_soft_requant: {self.O_soft_requant[0, 1, :]}") - - if (self.S_ITA - self.S) > 0: - self.O_soft_requant[:, -(self.S_ITA - self.S):, :] = 0 - if (self.P_ITA - self.P) > 0: - self.O_soft_requant[:, :, -(self.P_ITA - self.P):] = 0 - - matrix = np.squeeze(self.O_soft_requant) - plt.imshow(matrix, cmap='viridis') - plt.colorbar() - plt.title("O_soft_requant/O_soft") - plt.show() - - self.tiler_AV(self.A_requant, np.transpose(self.Vp_requant, (0, 2, 1)), self.O_soft_requant, "A_stream_soft_in", - "Vp_in", "O_soft") - - - - def apply_activation(self, preactivation, activation): - if activation not in ["gelu", "relu", "identity"]: - 
raise ValueError("Activation function not supported") - - if activation == "gelu": - vectorized_gelu = np.vectorize(i_gelu_requantized) - postactivation = vectorized_gelu(preactivation, self.q_1, self.q_b, self.q_c, self.gelu_rqs_mul, - self.gelu_rqs_shift, self.gelu_rqs_add) - elif activation == "relu": - postactivation = np.maximum(preactivation, 0) - vectorized_requantize = np.vectorize(gelu_requantize) - postactivation = vectorized_requantize(postactivation, self.gelu_rqs_mul, self.gelu_rqs_shift, - self.gelu_rqs_add) - elif activation == "identity": - postactivation = preactivation.copy() - - return postactivation - - def step6_O(self): - self.Out_soft = np.matmul(self.O_soft_requant, self.Wo, dtype = np.int32) + self.Bo_broadcast - self.Out_soft = np.clip(self.Out_soft, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.Out_soft_requant = requantize(self.Out_soft, self.requant_eps_mult[5], self.requant_right_shift[5], - self.requant_add[5]) - - matrix = np.squeeze(self.Out_soft_requant) - plt.imshow(matrix, cmap='viridis') - plt.colorbar() - plt.title("Out_soft_requant") - plt.show() - - if (self.S_ITA - self.S) > 0: - self.Out_soft_requant[:, -(self.S_ITA - self.S):, :] = 0 - if (self.E_ITA - self.E) > 0: - self.Out_soft_requant[:, :, -(self.E_ITA - self.E):] = 0 - - self.tiler_Out(self.O_soft_requant, self.Wo, self.Bo, self.Out_soft_requant, "O_soft_in", "Wo", "Bo", - "Out_soft") - - def feedforward_layer(self): - self.FFp = np.matmul(self.FF, self.Wff, dtype = np.int32) + self.Bff_broadcast - self.FFp = np.clip(self.FFp, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.FFp_requant = requantize(self.FFp, self.requant_eps_mult_ffn[0], self.requant_right_shift_ffn[0], - self.requant_add_ffn[0]) - self.FFp_requant = self.apply_activation(self.FFp_requant, self.activation) - - self.tiler_QK(self.FF, self.Wff, self.Bff, self.FFp_requant, "FF", "Wff", "Bff", "FFp") - - self.FF2p = np.matmul(self.FFp_requant, self.Wff2, dtype = np.int32) + self.Bff2_broadcast - 
self.FF2p = np.clip(self.FF2p, -2**(self.WO - 1), 2**(self.WO - 1) - 1) - self.FF2p_requant = requantize(self.FF2p, self.requant_eps_mult_ffn[1], self.requant_right_shift_ffn[1], - self.requant_add_ffn[1]) - - self.tiler_Out(self.FFp_requant, self.Wff2, self.Bff2, self.FF2p_requant, "FFp_in", "Wff2", "Bff2", "FF2p") - - def step7_Osum(self): - self.Out_soft_sum = np.sum(self.Out_soft_requant, axis = 0, dtype = np.int32, keepdims = True) - self.Out_soft_sum_requant = requantize(self.Out_soft_sum, self.requant_eps_mult[6], self.requant_right_shift[6], - self.requant_add[6]) - - def test_activations(self): - write_matrix(self.preactivation, "preactivation", self.paths["standalone"]) - gelu = np.zeros(self.preactivation.shape, dtype = np.int8) - relu = np.zeros(self.preactivation.shape, dtype = np.int8) - for i in range(self.preactivation.shape[0]): - for j in range(self.preactivation.shape[1]): - gelu[i, j] = i_gelu_requantized(self.preactivation[i, j], self.q_1, self.q_b, self.q_c, - self.gelu_rqs_mul, self.gelu_rqs_shift, self.gelu_rqs_add) - relu[i, j] = self.preactivation[i, j] if self.preactivation[i, j] > 0 else 0 - relu[i, j] = gelu_requantize(relu[i, j], self.gelu_rqs_mul, self.gelu_rqs_shift, self.gelu_rqs_add) - - write_matrix(gelu, "gelu", self.paths["standalone"]) - write_matrix(relu, "relu", self.paths["standalone"]) - - def export_hwpe(self): - path = self.paths["hwpe"] - - def remove_if_exists(file_name): - if os.path.exists(file_name): - os.remove(file_name) - - # WIESEP: Delete the old file otherwise it will lead to mismatches during RTL simulations as the files are memory mapped - mem_file = "mem" - files = [ - f"{mem_file}.txt", "Output.txt", "Q.txt", "K.txt", "V.txt", "QK.txt", "A.txt", "AV.txt", "OW.txt", "F1.txt", - "F2.txt" - ] - for file in files: - remove_if_exists(f"{path}/{file}") - - # Write the new mem file - # Layer: Attention - for h in range(self.H): - q = split_matrix(self.Q, (self.ITA_M, self.ITA_M)) - 
write_matrix_mem_hex(pack_array_8b_to_word(q, hex_string = False), mem_file, path) - - k = split_matrix(self.K, (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(k, hex_string = False), mem_file, path) - - w1 = split_matrix(np.transpose(self.Wq[h]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(w1, hex_string = False), mem_file, path) - - w2 = split_matrix(np.transpose(self.Wk[h]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(w2, hex_string = False), mem_file, path) - - w3 = split_matrix(np.transpose(self.Wv[h]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(w3, hex_string = False), mem_file, path) - - w4 = split_matrix(np.transpose(self.Wo[h]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(w4, hex_string = False), mem_file, path) - - b1_hex = np.vectorize(lambda val: to_hex(val, bit_size = 24))(self.Bq[h]) - # pack 24-bit values into 32-bit words - packed_b1_hex = np.array(pack_hex_24b(b1_hex)) - write_vector_mem_hex(packed_b1_hex, mem_file, path) - - b2_hex = np.vectorize(lambda val: to_hex(val, bit_size = 24))(self.Bk[h]) - # pack 24-bit values into 32-bit words - packed_b2_hex = np.array(pack_hex_24b(b2_hex)) - write_vector_mem_hex(packed_b2_hex, mem_file, path) - - b3_hex = np.vectorize(lambda val: to_hex(val, bit_size = 24))(self.Bv[h]) - # pack 24-bit values into 32-bit words - packed_b3_hex = np.array(pack_hex_24b(b3_hex)) - write_vector_mem_hex(packed_b3_hex, mem_file, path) - - b4_hex = np.vectorize(lambda val: to_hex(val, bit_size = 24))(self.Bo[h]) - # pack 24-bit values into 32-bit words - packed_b4_hex = np.array(pack_hex_24b(b4_hex)) - write_vector_mem_hex(packed_b4_hex, mem_file, path) - - # Write output - qp = split_matrix(self.Qp_requant[h], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(qp, hex_string = False), "Q", path) - - kp = split_matrix(self.Kp_requant[h], (self.ITA_M, self.ITA_M)) - 
write_matrix_mem_hex(pack_array_8b_to_word(kp, hex_string = False), "K", path) - - v = split_matrix(np.transpose(self.Vp_requant[h]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(v, hex_string = False), "V", path) - - qk = split_matrix(self.A_requant[h], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(qk, hex_string = False), "QK", path) - - a = split_matrix(self.A_partial_softmax[h], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(a, hex_string = False), "A", path) - - o = split_matrix(self.O_soft_requant[h], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(o, hex_string = False), "AV", path) - - out = split_matrix(self.Out_soft_requant[h], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(out, hex_string = False), "OW", path) - - # Layer: Feedforward - ff = split_matrix(self.FF, (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(ff, hex_string = False), mem_file, path) - - wff = split_matrix(np.transpose(self.Wff[0]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(wff, hex_string = False), mem_file, path) - - wff2 = split_matrix(np.transpose(self.Wff2[0]), (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(wff2, hex_string = False), mem_file, path) - - bff_hex = np.vectorize(lambda val: to_hex(val, bit_size = 24))(self.Bff[0]) - # pack 24-bit values into 32-bit words - packed_bff_hex = np.array(pack_hex_24b(bff_hex)) - write_vector_mem_hex(packed_bff_hex, mem_file, path) - - bff2_hex = np.vectorize(lambda val: to_hex(val, bit_size = 24))(self.Bff2[0]) - # pack 24-bit values into 32-bit words - packed_bff2_hex = np.array(pack_hex_24b(bff2_hex)) - write_vector_mem_hex(packed_bff2_hex, mem_file, path) - - # Write output - ff = split_matrix(self.FFp_requant[0], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(ff, hex_string = False), "F1", path) - - ff2 = 
split_matrix(self.FF2p_requant[0], (self.ITA_M, self.ITA_M)) - write_matrix_mem_hex(pack_array_8b_to_word(ff2, hex_string = False), "F2", path) - - def generate_snitch_cluster(self) -> str: - """ - This function generates a header file for ITA integrated into the the Snitch cluster. - - Returns: - str: The generated configuration file as a string. - """ - - ret = "" - - ret += f"""/* This file is automatically generated by '{" ".join(sys.argv)}' -* Do not edit manually, any manual change will be overwritten. -*/ - -// clang-format off -""" - - def generate_C_array(array, name, type = "uint32_t"): - """ - Generates a C-style array declaration from a numpy array. - - Args: - array (np.ndarray): The numpy array to be converted. - name (str): The name of the array in the generated code. - - Returns: - str: The C-style array declaration. - """ - return f"const {type} {name}[{array.size}] = {{\n{generate_matrix_mem(array)}\n}};\n" - - def generate_multihead_C_array(multihead_array, name, _type): - ret = "" - ret += f"const {_type} {name}[{self.H}][{multihead_array[0].size}] = {{\n" - ret += ",\n".join([f"{{\n{generate_matrix_mem(array)}\n}}" for array in multihead_array]) - ret += "\n};\n" - return ret - - def requant_multihead_harmonization_and_pack_8b(requant_array): - ret = [] - for i in range(self.H): - ret.append(pack_8b_to_word(np.pad(requant_array[:6, i], (0, 2)))) - return np.array(ret) - - def generate_define(name, value): - return f"#define {name.upper()} {value}\n" - - # Inputs (Q, K) - ret += generate_C_array(self.split_m_m(self.Q), "input_q", "int8_t") - ret += generate_C_array(self.split_m_m(self.K), "input_k", "int8_t") - - # Weights (Wq, Wk, Wv, Wo) - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Wq.transpose(0, 2, 1)), "input_Wq", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Wk.transpose(0, 2, 1)), "input_Wk", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Wv.transpose(0, 2, 
1)), "input_Wv", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Wo.transpose(0, 2, 1)), "input_Wo", "int8_t") - - # Biases (Bq, Bk, Bv, Bo) - ret += generate_multihead_C_array(self.Bq, "input_Bq", "ita_int24_t") - ret += generate_multihead_C_array(self.Bk, "input_Bk", "ita_int24_t") - ret += generate_multihead_C_array(self.Bv, "input_Bv", "ita_int24_t") - ret += generate_multihead_C_array(self.Bo, "input_Bo", "ita_int24_t") - - # Requantization parameters - ret += generate_multihead_C_array(requant_multihead_harmonization_and_pack_8b(self.requant_eps_mult), - "requant_eps_mult", "int32_t") - ret += generate_multihead_C_array(requant_multihead_harmonization_and_pack_8b(self.requant_right_shift), - "requant_right_shift", "int32_t") - ret += generate_multihead_C_array(requant_multihead_harmonization_and_pack_8b(self.requant_add), "requant_add", - "int32_t") - - # Intermediate results (Qp, Kp, Vp, A, O_soft, Out_soft) - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Qp_requant), "golden_interm_Pq", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Kp_requant), "golden_interm_Pk", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Vp_requant.transpose((0, 2, 1))), - "golden_interm_Pv", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.A_requant), "golden_interm_attention", "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.O_soft_requant), "golden_interm_head_output", - "int8_t") - ret += generate_multihead_C_array(self.split_multihead_m_m(self.Out_soft_requant), "golden_output", "int8_t") - - ret += "\n" - - ret += generate_define("heads", self.H) - ret += generate_define("sequence_length", self.S) - ret += generate_define("embedding_space", self.E) - ret += generate_define("projection_space", self.P) - ret += generate_define("n_tile_sequence_length", self.S // 64) - ret += generate_define("n_tile_embedding_space", self.E 
// 64) - ret += generate_define("n_tile_projection_space", self.P // 64) - ret += generate_define("tile_size_sequence_length", 64) - ret += generate_define("tile_size_embedding_space", 64) - ret += generate_define("tile_size_projection_space", 64) - - ret += '\n// clang-format on\n' - - return ret - - def export_snitch_cluster(self, path, filename = "mem_snitch_cluster.h"): - if path == './': - path = self.paths["snitch-cluster"] - - print(f"=> Exporting memory file to '{path}'") - - with open(os.path.join(path, filename), "w") as f: - f.write(self.generate_snitch_cluster()) - - def export_mempool(self, path): - # WIESEP: TODO: Refactor code to use new split_matrix function - - if path == './': - path = self.paths["mempool"] - - print(f"=> Exporting memory file to '{path}'") - - requant_eps_mult = np.pad(self.requant_eps_mult[:6, :].T, ((0, 0), (0, 2)), mode = "constant") - requant_right_shift = np.pad(self.requant_right_shift[:6, :].T, ((0, 0), (0, 2)), mode = "constant") - requant_add = np.pad(self.requant_add[:6, :].T, ((0, 0), (0, 2)), mode = "constant") - - with open('%s%s.c' % (path, "mem"), "w+") as f: - f.write(f"""/* This file is automatically generated by '{" ".join(sys.argv)}' -* Do not edit manually, any manual change will be overwritten. 
-*/ - -// clang-format off -""") - - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write('#include \n') - f.write(f'\nconst uint8_t Requant_Mult[{self.H}][{requant_eps_mult[0].size}] = ' + '{') - write_matrix_mem([requant_eps_mult], "mem", path) - - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write('};' + f'\nconst uint8_t Requant_Shift[{self.H}][{requant_right_shift[0].size}] = ' + '{') - write_matrix_mem([requant_right_shift], "mem", path) - - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write('};' + f'\nconst int8_t Requant_Add[{self.H}][{requant_add[0].size}] = ' + '{') - write_matrix_mem([requant_add], "mem", path) - - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write('};\n\n') - - for h in range(self.H): - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write(f'const int8_t inputs_{h}[] __attribute__((aligned(0x1000))) = ' + '{\n') - - w4 = np.concatenate([np.transpose(self.Wo[h])]) - write_matrix_mem(w4, "mem", path) - - w3 = np.concatenate([np.transpose(self.Wv[h])]) - write_matrix_mem(w3, "mem", path) - - w2 = np.concatenate([np.transpose(self.Wk[h])]) - write_matrix_mem(w2, "mem", path) - - q = np.concatenate(np.split(self.Q, self.split, axis = 1)) - write_matrix_mem(q, "mem", path) - - k = np.concatenate(np.split(self.K, self.split, axis = 1)) - write_matrix_mem(k, "mem", path) - - # w1 = np.concatenate([np.transpose(self.Wq[i]) for i in range(self.H)]) - w1 = np.concatenate(np.split(np.concatenate([np.transpose(self.Wq[h])]), self.split, axis = 1)) - write_matrix_mem(w1, "mem", path) - - b4 = np.reshape(np.split(self.Bo_broadcast[h], self.split, axis = 1), (self.S_ITA, self.E_ITA)) - write_matrix_mem(b4, "mem", path) - - b3 = np.reshape( - np.split(np.reshape(np.transpose(self.Bv_broadcast[h]), (self.P_ITA, self.S_ITA)), self.split, - axis = 1), (self.P_ITA, self.S_ITA)) - write_matrix_mem(b3, "mem", path) - - b2 = np.reshape(np.split(self.Bk_broadcast[h], self.split, axis = 1), (self.S_ITA, self.P_ITA)) - 
write_matrix_mem(b2, "mem", path) - - b1 = np.reshape(np.split(self.Bq_broadcast[h], self.split, axis = 1), (self.S_ITA, self.P_ITA)) - write_matrix_mem(b1, "mem", path) - - with open('%s%s.c' % (path, "mem"), "ab+") as f: - f.seek(-1, os.SEEK_END) - f.truncate() - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write('\n};\n\n') - - with open('%s%s.c' % (path, "mem"), "a+") as f: - f.write('\n// clang-format on\n') - tot_bytes = np.size(self.Q) + np.size(self.K) + np.size(self.Wq) + np.size(self.Bq_broadcast) \ - + np.size(self.Wk) + np.size(self.Bk_broadcast) + np.size(self.Wv) + np.size(self.Bv_broadcast) + \ - np.size(self.Wo) + np.size(self.Bo_broadcast) - - tot_params = tot_bytes = np.size(self.Q) + np.size(self.K) + np.size(self.Wq) + np.size(self.Bq) \ - + np.size(self.Wk) + np.size(self.Bk) + np.size(self.Wv) + np.size(self.Bv) + \ - np.size(self.Wo) + np.size(self.Bo) - - print(f"{'Number of Bytes' :<{30}}: {tot_bytes} ({tot_bytes/1024} kB)") - print(f"{'Number of Parameters' :<{30}}: {tot_params} ({tot_params/1000} k)") - - def export_numpy(self): - assert np.all(np.equal(self.K, self.V)), "For ITA, keys and values have to be equal" - q = self.Q_in - k = self.K_in - w1 = self.Wq_in - b1 = self.Bq_in - w2 = self.Wk_in - b2 = self.Bk_in - w3 = self.Wv_in - b3 = self.Bv_in - w4 = self.Wo_in - b4 = self.Bo_in - o = self.Out_soft_requant[:, :self.S, :self.E] - o_sum = self.Out_soft_sum_requant[:, :self.S, :self.E] - np.savez('%s%s.npz' % (self.paths["base"], "mha"), - q = q, - k = k, - w1 = w1, - b1 = b1, - w2 = w2, - b2 = b2, - w3 = w3, - b3 = b3, - w4 = w4, - b4 = b4, - o = o, - o_sum = o_sum, - rqs_mult = self.requant_eps_mult, - rqs_shift = self.requant_right_shift, - rqs_add = self.requant_add) - - -def generateTestVectors(path, **kwargs): - s = kwargs['S'] - p = kwargs['P'] - e = kwargs['E'] - f = kwargs['F'] - h = kwargs['H'] - activation = kwargs['activation'] - mask = kwargs['mask'] - index = kwargs['I'] - bias = int(not kwargs['no_bias']) - 
export_snitch_cluster = kwargs['export_snitch_cluster'] - export_mempool = kwargs['export_mempool'] - - acc1 = Transformer(s, p, e, f, h, bias = bias, path = path, activation = activation, mask = mask) - - if kwargs['verbose']: - print("=> Generating test vectors...") - acc1.print_properties(kwargs['verbose']) - acc1.step1_Qp() - acc1.step2_Kp() - acc1.step3_Vp() - acc1.step4_QK(kwargs['no_partial_softmax'], index=index) - acc1.step5_AV() - acc1.step6_O() - acc1.step7_Osum() - acc1.feedforward_layer() - acc1.test_activations() - - if export_mempool: - acc1.export_mempool(kwargs['mem_path']) - if export_snitch_cluster: - acc1.export_snitch_cluster(kwargs['mem_path']) - acc1.export_hwpe() - acc1.export_numpy() - - def calculate_tensor_stats(tensor, name, tol = 1e-1): - # Calculate the similarly of elements within one row and over all columns - similarity_row = np.mean(np.abs(np.diff(tensor, axis = -2))) - similarity_column = np.mean(np.abs(np.diff(tensor, axis = -1))) - - if (similarity_row < tol) or (similarity_column < tol): - if name is not None: - print(f"WARNING: {name} is constant!") - print(f"{name} Mean-Squared Difference (row) : {similarity_row:5.1f}") - print(f"{name} Mean-Squared Difference (column): {similarity_column:5.1f}") - if kwargs['skip_vector_validation'] is False: - raise ValueError(f"Tensor {name} is constant! This is a bad test vector!") - else: - print(f" WARNING: Tensor {name} is constant! 
This is a bad test vector!") - else: - print(" WARNING: Tensor is constant!") - print(f" Mean-Squared Difference (row) : {similarity_row:5.1f}") - print(f" Mean-Squared Difference (column): {similarity_column:5.1f}") - - return similarity_row, similarity_column - - def print_tensor_stats(tensor, name = None): - print(f" Min: {np.min(tensor)}") - print(f" Max: {np.max(tensor)}") - - similarity_row, similarity_column = calculate_tensor_stats(tensor, name) - - print(f" Mean-Squared Difference (row) : {similarity_row:5.1f}") - print(f" Mean-Squared Difference (column): {similarity_column:5.1f}") - - # Calculate all tensor statistics - tensors = { - "Qp": acc1.Qp_requant, - "Kp": acc1.Kp_requant, - "Vp": acc1.Vp_requant, - "A": acc1.A_requant, - "A_soft": acc1.A_partial_softmax, - "O_soft": acc1.O_soft_requant, - "Out_soft": acc1.Out_soft_requant, - "Out_soft_sum": acc1.Out_soft_sum_requant - } - - for name, tensor in tensors.items(): - calculate_tensor_stats(tensor, name) - - # Check if softmax is sufficiently precise - maep_softmax = error_MAEP(acc1.A_partial_softmax, acc1.A_real_softmax) - if maep_softmax > 5: - print(f"WARNING: Softmax is not precise enough! 
MAEP Error to Integer Softmax: {maep_softmax:.2f}%") - - if kwargs['verbose'] > 1: - print("=> Qp") - print_tensor_stats(acc1.Qp_requant) - if kwargs['verbose'] > 4: - print(acc1.Qp) - if kwargs['verbose'] > 3: - print(acc1.Qp_requant) - - print("=> Kp") - print_tensor_stats(acc1.Kp_requant) - if kwargs['verbose'] > 4: - print(acc1.Kp) - if kwargs['verbose'] > 3: - print(acc1.Kp_requant) - - print("=> Vp") - print_tensor_stats(acc1.Vp_requant) - if kwargs['verbose'] > 4: - print(acc1.Vp) - if kwargs['verbose'] > 3: - print(acc1.Vp_requant) - - print("=> A") - print_tensor_stats(acc1.A_requant) - if kwargs['verbose'] > 4: - print(acc1.A) - if kwargs['verbose'] > 3: - print(acc1.A_requant) - - print("=> A (partial softmax)") - print_tensor_stats(acc1.A_partial_softmax) - print(f" MAEP Error to Integer Softmax: {maep_softmax:.2f}%") - if kwargs['verbose'] > 3: - print(acc1.A_partial_softmax) - - print("=> O (soft)") - print_tensor_stats(acc1.O_soft_requant) - if kwargs['verbose'] > 4: - print(acc1.O_soft) - if kwargs['verbose'] > 3: - print(acc1.O_soft_requant) - - print("=> Output (all heads)") - print_tensor_stats(acc1.Out_soft_requant) - if kwargs['verbose'] > 3: - print(acc1.Out_soft_requant) - - print("=> Output (accumulated)") - print_tensor_stats(acc1.Out_soft_sum_requant) - if kwargs['verbose'] > 3: - print(acc1.Out_soft_sum_requant) - - if kwargs['plot_tensors']: - # Plot distribution of all input and output tensors - import matplotlib.pyplot as plt - import seaborn as sns - from matplotlib.gridspec import GridSpec - - def plot_distribution(tensor, title, ax): - sns.histplot(tensor.flatten(), bins = 50, kde = True, ax = ax) - ax.set_title(title) - - # Plot color values of all tensors - def plot_heatmap(tensor, title, ax): - # If tensor is more than 2D, only plot the first 2D - if len(tensor.shape) > 2: - tensor = tensor[0] - - sns.heatmap(tensor, ax = ax, cbar = False) - # Do not show ticks - ax.set_xticks([]) - ax.set_yticks([]) - ax.set_title(title) - - # 
Create sublots - fig = plt.figure(figsize = (12, 12), layout = 'tight', dpi = 72) - - gs = GridSpec(8, 12, figure = fig) - - ax = fig.add_subplot(gs[0, 0:3]) - plot_distribution(acc1.Q, "Q", ax) - ax = fig.add_subplot(gs[0, 3:6]) - plot_heatmap(acc1.Q, "Q", ax) - ax = fig.add_subplot(gs[0, 6:9]) - plot_distribution(acc1.K, "K", ax) - ax = fig.add_subplot(gs[0, 9:12]) - plot_heatmap(acc1.K, "K", ax) - - ax = fig.add_subplot(gs[1, 0:3]) - plot_distribution(acc1.Wq, "Wq", ax) - ax = fig.add_subplot(gs[1, 3:6]) - plot_distribution(acc1.Wk, "Wk", ax) - ax = fig.add_subplot(gs[1, 6:9]) - plot_distribution(acc1.Wv, "Wv", ax) - ax = fig.add_subplot(gs[1, 9:12]) - plot_distribution(acc1.Wo, "Wo", ax) - - ax = fig.add_subplot(gs[2, 0:3]) - plot_heatmap(acc1.Wq, "Wq", ax) - ax = fig.add_subplot(gs[2, 3:6]) - plot_heatmap(acc1.Wk, "Wk", ax) - ax = fig.add_subplot(gs[2, 6:9]) - plot_heatmap(acc1.Wv, "Wv", ax) - ax = fig.add_subplot(gs[2, 9:12]) - plot_heatmap(acc1.Wo, "Wo", ax) - - ax = fig.add_subplot(gs[3, 0:3]) - plot_distribution(acc1.Bq, "Bq", ax) - ax = fig.add_subplot(gs[3, 3:6]) - plot_distribution(acc1.Bk, "Bk", ax) - ax = fig.add_subplot(gs[3, 6:9]) - plot_distribution(acc1.Bv, "Bv", ax) - ax = fig.add_subplot(gs[3, 9:12]) - plot_distribution(acc1.Bo, "Bo", ax) - - ax = fig.add_subplot(gs[4, 0:3]) - plot_distribution(acc1.Qp_requant, "Qp", ax) - ax = fig.add_subplot(gs[4, 3:6]) - plot_distribution(acc1.Kp_requant, "Kp", ax) - ax = fig.add_subplot(gs[4, 6:9]) - plot_distribution(acc1.Vp_requant, "Vp", ax) - - ax = fig.add_subplot(gs[5, 0:3]) - plot_heatmap(acc1.Qp_requant, "Qp", ax) - ax = fig.add_subplot(gs[5, 3:6]) - plot_heatmap(acc1.Kp_requant, "Kp", ax) - ax = fig.add_subplot(gs[5, 6:9]) - plot_heatmap(acc1.Vp_requant, "Vp", ax) - - ax = fig.add_subplot(gs[6, 0:3]) - plot_distribution(acc1.A_requant, "QK", ax) - ax = fig.add_subplot(gs[6, 3:6]) - plot_distribution(acc1.A_partial_softmax, "A", ax) - ax = fig.add_subplot(gs[6, 6:9]) - 
plot_distribution(acc1.O_soft_requant, "O", ax) - ax = fig.add_subplot(gs[6, 9:12]) - plot_distribution(acc1.Out_soft_requant, "Out", ax) - - ax = fig.add_subplot(gs[7, 0:3]) - plot_heatmap(acc1.A_requant, "QK", ax) - ax = fig.add_subplot(gs[7, 3:6]) - plot_heatmap(acc1.A_partial_softmax, "A", ax) - ax = fig.add_subplot(gs[7, 6:9]) - plot_heatmap(acc1.O_soft_requant, "O", ax) - ax = fig.add_subplot(gs[7, 9:12]) - plot_heatmap(acc1.Out_soft_requant, "Out", ax) - - plt.show() - - -def util_main(**kwargs): - B = 8 - log2e = np.log2(np.exp(1)) - eps_max = B / (2**B) - - N = 1024 - A = np.random.randint(-128, 127, size = (1, N, N), dtype = np.int8) - input_float = A * eps_max # Assume eps is eps_max - input_int = A - - fast_softmax = fastSoftmax(input_float, False) - fast_integer_softmax = fastSoftmax(input_int, True) / 255 - - fast_partial_softmax = streamingPartialSoftmax(input_float, False) - fast_partial_integer_softmax = streamingPartialSoftmax(input_int, True) / 255 - - softmax = realSoftmax(input_float, False) - integer_softmax = realSoftmax(input_int, True) / 255 - - print(f"=> L2 Softmax Differences:") - print( - f" Softmax - Fast Softmax : {np.linalg.norm((softmax-fast_softmax)[0], 2):.10}" - ) - print( - f" Softmax - Fast Partial Softmax : {np.linalg.norm((softmax-fast_partial_softmax)[0], 2):.10}" - ) - print( - f" Softmax - Fast Integer Softmax : {np.linalg.norm((softmax-fast_integer_softmax)[0], 2):.10}" - ) - print( - f" Softmax - Fast Partial Integer Softmax : {np.linalg.norm((softmax-fast_partial_integer_softmax)[0], 2):.10}" - ) - # print(f" Integer Softmax - Fast Integer Softmax : {np.linalg.norm((integer_softmax-fast_integer_softmax)[0], 2):.3}") - # print(f" Integer Softmax - Fast Partial Integer Softmax : {np.linalg.norm((integer_softmax-fast_partial_integer_softmax)[0], 2):.3}") - # print(f" Softmax - Integer Softmax : {np.linalg.norm((integer_softmax-softmax)[0], 2):.3}") - # print(f" Fast Softmax - Fast Partial Softmax : 
{np.linalg.norm((fast_softmax-fast_partial_softmax)[0], 2):.3}") - # print(f" Fast Integer Softmax - Fast Partial Integer Softmax : {np.linalg.norm((fast_integer_softmax-fast_partial_integer_softmax)[0], 2):.3}") - - TEST_QUANTLIB = True - if TEST_QUANTLIB: - import torch - - from quantlib.algorithms.pact.pact_ops import (PACTIntegerITAMax, PACTIntegerITAPartialMax, PACTITAMax, - PACTITAPartialMax) - input = torch.tensor(input_float).unsqueeze(0).float() - - ITAMax = PACTITAMax() - ITAPartialMax = PACTITAPartialMax(ita_sequence_length = N) - ITAmax_softmax = ITAMax.forward(input).detach().numpy().squeeze(axis = 0) - ITApartialmax_softmax = ITAPartialMax.forward(input).detach().numpy().squeeze(axis = 0) - - ITAMax.started = torch.tensor(1) - ITAPartialMax.started = torch.tensor(1) - ITAMax.set_eps_in(torch.tensor((eps_max,))) - ITAPartialMax.set_eps_in(torch.tensor((eps_max,))) - ITAMax_integer_softmax = ITAMax.forward(input).detach().numpy().squeeze(axis = 0) - ITAPartialMax_integer_softmax = ITAPartialMax.forward(input).detach().numpy().squeeze(axis = 0) - - input = torch.tensor(input_int).unsqueeze(0).float() - ITAIntegerMax_softmax = PACTIntegerITAMax.MySoftmax.forward( - None, input, torch.tensor(256)).detach().numpy().squeeze(axis = 0) - ITAPartialIntegerMax_softmax = PACTIntegerITAMax.MySoftmax.forward( - None, input, torch.tensor(256)).detach().numpy().squeeze(axis = 0) - - print() - print(f"=> L2 PyTorch Softmax Differences:") - print( - f" Fast Softmax - ITAmax : {np.linalg.norm((fast_softmax-ITAmax_softmax)[0], 2):.3}" - ) - print( - f" Fast Partial Softmax - ITAPartialMax : {np.linalg.norm((fast_partial_softmax-ITApartialmax_softmax)[0], 2):.3}" - ) - print( - f" Fast Integer Softmax - Fake-Quantized ITAmax : {np.linalg.norm((fast_integer_softmax-ITAMax_integer_softmax)[0], 2):.3}" - ) - print( - f" Fast Integer Partial Softmax - Fake-Quantized ITAPartialMax : {np.linalg.norm((fast_partial_integer_softmax-ITAPartialMax_integer_softmax)[0], 2):.3}" - ) - 
print( - f" Fast Integer Softmax - True-Quantized ITAmax : {np.linalg.norm((fast_integer_softmax-ITAIntegerMax_softmax/255)[0], 2):.3}" - ) - print( - f" Fast Integer Partial Softmax - True-Quantized ITAPartialMax : {np.linalg.norm((fast_partial_integer_softmax-ITAPartialIntegerMax_softmax/255)[0], 2):.3}" - ) diff --git a/modelsim/sim_ita_tb_wave.tcl b/modelsim/sim_ita_tb_wave.tcl index 78d5ce5..490d5e6 100644 --- a/modelsim/sim_ita_tb_wave.tcl +++ b/modelsim/sim_ita_tb_wave.tcl @@ -20,10 +20,6 @@ add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_softmax_top/i_ add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_softmax_top/i_softmax/exp_sum_d add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_softmax_top/i_softmax/exp_sum_q add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_softmax_top/i_softmax/disable_row -add wave -noupdate -expand -group {Masking Signals} -group {Mask Tile Pos} -radix unsigned /ita_tb/dut/i_controller/mask_tile_x_pos_d -add wave -noupdate -expand -group {Masking Signals} -group {Mask Tile Pos} -radix unsigned /ita_tb/dut/i_controller/mask_tile_x_pos_q -add wave -noupdate -expand -group {Masking Signals} -group {Mask Tile Pos} -radix unsigned /ita_tb/dut/i_controller/mask_tile_y_pos_d -add wave -noupdate -expand -group {Masking Signals} -group {Mask Tile Pos} -radix unsigned /ita_tb/dut/i_controller/mask_tile_y_pos_q add wave -noupdate -expand -group {Masking Signals} -group {Mask Tile Pos} -radix unsigned /ita_tb/dut/i_controller/first_outer_dim add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_inp2_mux/clk_i add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/last_inner_tile_q6 @@ -54,18 +50,6 @@ add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_softmax_top/i_ add wave -noupdate -expand -group {Masking Signals} /ita_tb/dut/i_controller/step_q add wave -noupdate -expand -group {Masking Signals} 
/ita_tb/dut/i_softmax_top/i_softmax/calc_stream_soft_en_q add wave -noupdate -expand -group {Masking Signals} -radix unsigned /ita_tb/dut/i_controller/count_q -add wave -noupdate -expand -group {Masking Signals} -radix binary /ita_tb/dut/i_controller/mask_d -add wave -noupdate -expand -group {Masking Signals} -radix unsigned /ita_tb/dut/i_controller/mask_pos_d -add wave -noupdate -expand -group {Masking Signals} -radix unsigned /ita_tb/dut/i_controller/mask_pos_q -add wave -noupdate -expand -group {Masking Signals} -radix unsigned /ita_tb/dut/i_controller/mask_col_offset_q -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/mask_tile_x_pos_d -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/mask_tile_x_pos_q -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/mask_tile_y_pos_d -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/mask_tile_y_pos_q -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/tile_x_d -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/tile_x_q -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/tile_y_d -add wave -noupdate -radix unsigned /ita_tb/dut/i_controller/tile_y_q add wave -noupdate /ita_tb/dut/calc_en_q5 add wave -noupdate /ita_tb/dut/calc_en_q6 add wave -noupdate /ita_tb/dut/calc_en_q7 @@ -118,438 +102,7 @@ add wave -noupdate /ita_tb/dut/i_activation/data_o add wave -noupdate /ita_tb/dut/i_fifo/data_i add wave -noupdate /ita_tb/dut/i_fifo/data_o add wave -noupdate /ita_tb/dut/oup_o -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/clk_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/rst_ni -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/mode_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/eps_mult_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/right_shift_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/calc_en_i -add wave -noupdate -group Requantizer 
/ita_tb/dut/i_requantizer/calc_en_q_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/result_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/requant_oup_o -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/mult_signed -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/product -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/shifted_added -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/shifted_d -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/shifted_q -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q1 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q2 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q3 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q4 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/requant_oup_d -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/requant_oup_q -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/clk_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/rst_ni -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/mode_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/eps_mult_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/right_shift_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/calc_en_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/calc_en_q_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/result_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_i -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/requant_oup_o -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/mult_signed -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/product -add wave 
-noupdate -group Requantizer /ita_tb/dut/i_requantizer/shifted_added -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/shifted_d -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/shifted_q -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q1 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q2 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q3 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/add_q4 -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/requant_oup_d -add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/requant_oup_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/clk_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/rst_ni -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ctrl_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_ready_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/weight_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/weight_ready_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_ready_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/oup_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/oup_ready_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/pop_softmax_fifo_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/step_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/soft_addr_div_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_done_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/calc_en_o -add 
wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_inner_tile_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/last_inner_tile_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_x_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_y_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias_pad_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/busy_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/calc_en_q1_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/step_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/step_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/count_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/count_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_count -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_pos_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_pos_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_col_offset_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_col_offset_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_d -add wave -noupdate 
-expand -group Controller /ita_tb/dut/i_controller/inner_tile_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_x_pos_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_x_pos_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_y_pos_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_y_pos_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_x_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_x_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_x_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_x_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_y_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_y_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_y_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_y_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_tile_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_tile_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_soft_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_soft_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias_padded -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/last_time -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_q -add wave 
-noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_dim -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_outer_dim -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/second_outer_dim -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_outer_dim_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_outer_dim_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/second_outer_dim_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/second_outer_dim_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_fifo -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_div -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_div_done_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_div_done_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/busy_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/busy_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/clk_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/rst_ni -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ctrl_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_ready_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/weight_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/weight_ready_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_valid_i -add wave -noupdate -expand -group Controller 
/ita_tb/dut/i_controller/bias_ready_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/oup_valid_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/oup_ready_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/pop_softmax_fifo_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/step_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/soft_addr_div_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_done_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/calc_en_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_inner_tile_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/last_inner_tile_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_x_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_y_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias_pad_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/busy_o -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/calc_en_q1_i -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/step_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/step_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/count_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/count_q -add wave -noupdate -expand -group Controller 
/ita_tb/dut/i_controller/bias_count -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_pos_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_pos_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_col_offset_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_col_offset_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_x_pos_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_x_pos_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_y_pos_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_tile_y_pos_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_x_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_x_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_x_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_x_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_y_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/tile_y_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_y_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/bias_tile_y_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_tile_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_tile_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_d -add wave -noupdate -expand 
-group Controller /ita_tb/dut/i_controller/ongoing_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_soft_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/ongoing_soft_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inp_bias_padded -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/last_time -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/mask_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/inner_tile_dim -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_outer_dim -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/second_outer_dim -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_outer_dim_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/first_outer_dim_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/second_outer_dim_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/second_outer_dim_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_fifo -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_div -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_div_done_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/softmax_div_done_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/busy_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/busy_q -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_d -add wave -noupdate -expand -group Controller /ita_tb/dut/i_controller/requant_add_q -add wave -noupdate -group {Softmax Controller} 
/ita_tb/dut/i_softmax_top/i_softmax/clk_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/rst_ni -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/ctrl_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/step_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/requant_oup_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_stream_soft_en_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/soft_addr_div_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/softmax_done_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/pop_softmax_fifo_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inp_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inp_stream_soft_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_inp_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_valid_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_ready_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_valid_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_ready_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_oup_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_acc_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_acc_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_acc_data_i -add wave -noupdate 
-group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_acc_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_acc_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_acc_data_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/prev_max_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_max_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_max_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_max_data_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_max_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_max_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_max_data_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_x_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_y_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inner_tile_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/mask_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q3 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q4 -add wave 
-noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q3 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q4 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inner_tile_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_y_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/exp_sum_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/exp_sum_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_mask_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_div_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_div_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/addr_div_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/addr_div_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_read_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_read_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_write_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_write_q -add 
wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/requant_oup_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_diff -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_sum_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_sum_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_diff -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_inp -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_inp_diff -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_stream_soft_en_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_q3 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/fifo_full -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/fifo_empty -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/push_to_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/pop_from_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/data_to_fifo 
-add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/data_from_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/fifo_usage -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/disable_shift -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/disable_row -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/disable_col -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/clk_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/rst_ni -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/ctrl_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/step_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/requant_oup_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_stream_soft_en_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/soft_addr_div_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/softmax_done_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/pop_softmax_fifo_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inp_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inp_stream_soft_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_inp_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_valid_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_ready_i -add wave -noupdate -group {Softmax Controller} 
/ita_tb/dut/i_softmax_top/i_softmax/div_valid_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_ready_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_oup_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_acc_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_acc_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_acc_data_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_acc_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_acc_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_acc_data_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/prev_max_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_max_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_max_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/read_max_data_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_max_en_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_max_addr_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/write_max_data_o -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_x_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_y_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inner_tile_i -add 
wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/mask_i -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q3 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_q4 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q3 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_q4 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/inner_tile_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_x_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/tile_y_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/mask_tile_x_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/mask_tile_x_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/mask_tile_y_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/mask_tile_y_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/exp_sum_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/exp_sum_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_d -add wave -noupdate -group 
{Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_soft_mask_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_div_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/count_div_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/addr_div_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/addr_div_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_read_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_read_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_write_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/div_write_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/requant_oup_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_diff -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_sum_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_sum_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/max_diff -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_inp -add wave -noupdate -group 
{Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/shift_inp_diff -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_stream_soft_en_q -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_d -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_q1 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_q2 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/calc_en_q3 -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/fifo_full -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/fifo_empty -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/push_to_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/pop_from_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/data_to_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/data_from_fifo -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/fifo_usage -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/disable_shift -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/disable_row -add wave -noupdate -group {Softmax Controller} /ita_tb/dut/i_softmax_top/i_softmax/disable_col -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/clk_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/rst_ni -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/calc_en_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/calc_en_q_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/first_tile_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/first_tile_q_i -add wave -noupdate 
-group Accumulator /ita_tb/dut/i_accumulator/last_tile_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/last_tile_q_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/oup_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/inp_bias_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/result_o -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_en -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_addr -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_data -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_data_unused -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_en -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_addr -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_data -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_addr_d -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_addr_q -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_addr_d -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_addr_q -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/result_d -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/result_q -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/clk_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/rst_ni -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/calc_en_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/calc_en_q_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/first_tile_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/first_tile_q_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/last_tile_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/last_tile_q_i -add wave -noupdate -group Accumulator 
/ita_tb/dut/i_accumulator/oup_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/inp_bias_i -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/result_o -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_en -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_addr -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_data -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_data_unused -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_en -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_addr -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_data -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_addr_d -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/read_addr_q -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_addr_d -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/write_addr_q -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/result_d -add wave -noupdate -group Accumulator /ita_tb/dut/i_accumulator/result_q add wave -noupdate -group Requantizer /ita_tb/dut/i_requantizer/* -add wave -expand -group Controller /ita_tb/dut/i_controller/* +add wave -group {Controller} /ita_tb/dut/i_controller/* add wave -group {Softmax Controller} ita_tb/dut/i_softmax_top/i_softmax/* add wave -group {Accumulator} ita_tb/dut/i_accumulator/* \ No newline at end of file diff --git a/src/ita.sv b/src/ita.sv index 8a9fe25..7822050 100644 --- a/src/ita.sv +++ b/src/ita.sv @@ -219,8 +219,7 @@ module ita .inp_bias_i (inp_bias ), .inp_bias_pad_o (inp_bias_padded ), .mask_o (mask ), - .busy_o (busy_o ), - .calc_en_q1_i (calc_en_q1 ) + .busy_o (busy_o ) ); ita_input_sampler i_input_sampler ( diff --git a/src/ita_controller.sv b/src/ita_controller.sv index d3c2f82..3a834bb 100644 --- a/src/ita_controller.sv +++ b/src/ita_controller.sv @@ -36,18 +36,14 
@@ module ita_controller input bias_t inp_bias_i , output bias_t inp_bias_pad_o , output logic [N-1:0] mask_o , - output logic busy_o , - input logic calc_en_q1_i + output logic busy_o ); step_e step_d, step_q; counter_t count_d, count_q, bias_count; - counter_t mask_pos_d, mask_pos_q; - logic [3:0] mask_col_offset_d, mask_col_offset_q; + counter_t tile_d, tile_q; counter_t inner_tile_d, inner_tile_q; - counter_t mask_tile_x_pos_d, mask_tile_x_pos_q; - counter_t mask_tile_y_pos_d, mask_tile_y_pos_q; counter_t tile_x_d, tile_x_q, bias_tile_x_d, bias_tile_x_q; counter_t tile_y_d, tile_y_q, bias_tile_y_d, bias_tile_y_q; counter_t softmax_tile_d, softmax_tile_q; @@ -56,14 +52,12 @@ module ita_controller bias_t inp_bias, inp_bias_padded; logic last_time; - logic [N-1:0] mask_d, mask_q; tile_t inner_tile_dim; - logic [WO-WI*2-2:0] first_outer_dim, second_outer_dim; - logic [WO-WI*2-2:0] first_outer_dim_d, first_outer_dim_q; - logic [WO-WI*2-2:0] second_outer_dim_d, second_outer_dim_q; + input_dim_t first_outer_dim, second_outer_dim; + input_dim_t first_outer_dim_d, first_outer_dim_q; + input_dim_t second_outer_dim_d, second_outer_dim_q; - logic softmax_fifo, softmax_div, softmax_div_done_d, softmax_div_done_q, busy_d, busy_q; requant_oup_t requant_add, requant_add_d, requant_add_q; @@ -74,7 +68,7 @@ module ita_controller assign inner_tile_o = inner_tile_q; assign requant_add_o = requant_add_q; assign inp_bias_pad_o = inp_bias_padded; - assign mask_o = mask_q; + always_comb begin count_d = count_q; @@ -96,10 +90,9 @@ module ita_controller last_time = 1'b0; requant_add = {N {requant_add_i}}; inp_bias = inp_bias_i; - - busy_d = busy_q; - softmax_fifo = 1'b0; - softmax_div = 1'b0; + busy_d = busy_q; + softmax_fifo = 1'b0; + softmax_div = 1'b0; if (step_q != AV) begin softmax_div_done_d = 1'b0; @@ -390,143 +383,6 @@ module ita_controller end inp_bias_padded = inp_bias; - case (ctrl_i.mask_type) - None: begin - mask_col_offset_d = '0; - mask_tile_x_pos_d = '0; - 
mask_tile_y_pos_d = '0; - mask_pos_d = '0; - mask_d = '0; - end - UpperTriangular: begin - mask_col_offset_d = (step_q == QK || step_q == AV) ? mask_col_offset_q : ((ctrl_i.mask_start_index) & (N-1)); - mask_tile_x_pos_d = (step_q == QK || step_q == AV) ? mask_tile_x_pos_q : ((ctrl_i.mask_start_index) / M); - mask_tile_y_pos_d = mask_tile_y_pos_q; - mask_pos_d = (step_q == QK || step_q == AV) ? mask_pos_q : ((((ctrl_i.mask_start_index)/N)*M) & ((M*M/N)-1)); - mask_d = '0; - - if (step_q == QK) begin - if (mask_tile_x_pos_q == tile_x_q && mask_tile_y_pos_q == tile_y_q && last_inner_tile_o == 1'b1) begin - if (count_q == ((M*M/N)-1)) begin - mask_tile_x_pos_d = mask_tile_x_pos_q + 1'b1; - end - if ((count_q >= mask_pos_q) && (count_q < (mask_pos_q + N))) begin - if ((count_q & (M-1)) == (M-1) && !(((count_q + mask_col_offset_q) & (N-1)) == (N-1))) begin - mask_tile_y_pos_d = tile_y_q + 1'b1; - mask_tile_x_pos_d = tile_x_q; - mask_pos_d = ((count_q + (((ctrl_i.tile_s * (M*M/N)) - M) + 1)) & ((M*M/N)-1)); - end else if ((count_q & (M-1)) == (M-1) && (((count_q + mask_col_offset_q) & (N-1)) == (N-1))) begin - if ((count_q / M) == ((M/N)-1)) begin - mask_tile_y_pos_d = tile_y_q + 1'b1; - mask_tile_x_pos_d = tile_x_q + 1'b1; - mask_pos_d = ((count_q + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); - end else begin - mask_tile_y_pos_d = tile_y_q + 1'b1; - mask_tile_x_pos_d = tile_x_q; - mask_pos_d = ((count_q + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); - end - end else if (((count_q + mask_col_offset_q) & (N-1)) == (N-1)) begin - mask_pos_d = (mask_pos_q + (N - ((mask_pos_q + mask_col_offset_q) & (N-1))) + M) & ((M*M/N)-1); - end - for (int i = 0; i < N; i++) begin - if (((count_q + mask_col_offset_q) & (N-1)) <= i) begin - mask_d[i] = 1'b1; - end else begin - mask_d[i] = 1'b0; - end - end - end else if ((count_q & (M-1)) < (mask_pos_q & (M-1))) begin - for (int i = 0; i < N; i++) begin - mask_d[i] = 1'b1; - end - end - end else if (mask_tile_x_pos_q <= tile_x_q 
&& mask_tile_y_pos_q != tile_y_q && last_inner_tile_o == 1'b1) begin - for (int i = 0; i < N; i++) begin - mask_d[i] = 1'b1; - end - end else if (mask_tile_x_pos_q != tile_x_q && mask_tile_y_pos_q == tile_y_q && last_inner_tile_o == 1'b1) begin - for (int i = 0; i < N; i++) begin - mask_d[i] = 1'b0; - end - end - end - end - LowerTriangular: begin - mask_col_offset_d = '0; - mask_tile_x_pos_d = '0; - mask_tile_y_pos_d = (step_q == QK || step_q == AV) ? mask_tile_y_pos_q : ((ctrl_i.mask_start_index) / M); - mask_pos_d = (step_q == QK || step_q == AV) ? mask_pos_q : (ctrl_i.mask_start_index & (M-1)); - mask_d = '0; - - if (step_q == QK) begin - if (mask_tile_x_pos_q == tile_x_q && mask_tile_y_pos_q == tile_y_q && last_inner_tile_o == 1'b1) begin - if (count_q == ((M*M/N)-1)) begin - mask_tile_x_pos_d = mask_tile_x_pos_q + 1'b1; - end - if ((count_q >= mask_pos_q) && (count_q < (mask_pos_q + N))) begin - if (((count_q & (M-1)) == (M-1)) && !(((count_q + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) == (N-1))) begin - mask_tile_y_pos_d = tile_y_q + 1'b1; - mask_tile_x_pos_d = tile_x_q; - mask_pos_d = ((count_q + (((ctrl_i.tile_s * (M*M/N)) - M) + 1)) & ((M*M/N)-1)); - end else if (((count_q & (M-1)) == (M-1)) && (((count_q + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) == (N-1))) begin - if ((count_q / M) == ((M/N)-1)) begin - mask_tile_y_pos_d = tile_y_q + 1'b1; - mask_tile_x_pos_d = tile_x_q + 1'b1; - mask_pos_d = ((count_q + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); - end else begin - mask_tile_y_pos_d = tile_y_q + 1'b1; - mask_tile_x_pos_d = tile_x_q; - mask_pos_d = ((count_q + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); - end - end else if (((count_q + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) == (N-1)) begin - mask_pos_d = (mask_pos_q + (count_q - mask_pos_q + 1) + M) & ((M*M/N)-1); - end - for (int i = 0; i < N; i++) begin - if (((count_q + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) >= i) begin - mask_d[i] = 1'b1; - end else 
begin - mask_d[i] = 1'b0; - end - end - end else if ((count_q & (M-1)) >= (mask_pos_q & (M-1))) begin - for (int i = 0; i < N; i++) begin - mask_d[i] = 1'b1; - end - end - end else if (mask_tile_x_pos_q > tile_x_q && mask_tile_y_pos_q == tile_y_q && last_inner_tile_o == 1'b1) begin - for (int i = 0; i < N; i++) begin - mask_d[i] = 1'b1; - end - end else if (mask_tile_x_pos_q >= tile_x_q && mask_tile_y_pos_q != tile_y_q && last_inner_tile_o == 1'b1) begin - for (int i = 0; i < N; i++) begin - mask_d[i] = 1'b0; - end - end - end - end - Strided: begin - mask_col_offset_d = '0; - mask_tile_x_pos_d = '0; - mask_tile_y_pos_d = '0; - mask_pos_d = '0; - mask_d = '0; - - if (step_q == QK) begin - if (last_inner_tile_o == 1'b1) begin - for (int i = 0; i < N; i++) begin - //col_pos = count_q/M + i + mask_tile_x_pos_q * M - //row_pos = count_q & (M-1) + mask_tile_y_pos_q * M - if ((((((count_q / M) * N) + i + (tile_x_q * M)) - ((count_q & (M-1)) + (tile_y_q * M))) & (ctrl_i.mask_start_index-1)) == 0) begin - mask_d[i] = 1'b0; - end else begin - mask_d[i] = 1'b1; - end - end - end - end - end - endcase - if (inp_valid_i && inp_ready_o && oup_valid_i && oup_ready_i && last_inner_tile_o) begin ongoing_d = ongoing_q; end else if (inp_valid_i && inp_ready_o && last_inner_tile_o) begin @@ -561,11 +417,6 @@ module ita_controller bias_tile_y_q <= '0; first_outer_dim_q <= '0; second_outer_dim_q <= '0; - mask_pos_q <= '0; - mask_col_offset_q <= '0; - mask_tile_x_pos_q <= '0; - mask_tile_y_pos_q <= '0; - mask_q <= '0; end else begin step_q <= step_d; count_q <= count_d; @@ -583,13 +434,20 @@ module ita_controller bias_tile_y_q <= bias_tile_y_d; first_outer_dim_q <= first_outer_dim_d; second_outer_dim_q <= second_outer_dim_d; - if (calc_en_o) begin - mask_pos_q <= mask_pos_d; - mask_tile_x_pos_q <= mask_tile_x_pos_d; - mask_tile_y_pos_q <= mask_tile_y_pos_d; - end - mask_q <= mask_d; - mask_col_offset_q <= mask_col_offset_d; end end + + ita_masking i_masking ( + .clk_i (clk_i), + .rst_ni 
(rst_ni), + .ctrl_i (ctrl_i), + .step_i (step_o), + .calc_en_i (calc_en_o), + .last_inner_tile_i (last_inner_tile_o), + .count_i (count_q), + .tile_x_i (tile_x_o), + .tile_y_i (tile_y_o), + .mask_o (mask_o) + ); + endmodule diff --git a/src/ita_masking.sv b/src/ita_masking.sv new file mode 100644 index 0000000..7273309 --- /dev/null +++ b/src/ita_masking.sv @@ -0,0 +1,190 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +/** + ITA masking module. +*/ + +module ita_masking + import ita_package::*; +( + input logic clk_i, + input logic rst_ni, + input ctrl_t ctrl_i, + input step_e step_i, + input logic calc_en_i, + input logic last_inner_tile_i, + input counter_t count_i, + input counter_t tile_x_i, + input counter_t tile_y_i, + output logic [N-1:0] mask_o +); + + logic [3:0] mask_col_offset_d, mask_col_offset_q; + counter_t mask_tile_x_pos_d, mask_tile_x_pos_q; + counter_t mask_tile_y_pos_d, mask_tile_y_pos_q; + counter_t mask_pos_d, mask_pos_q; + logic [N-1:0] mask_d, mask_q; + + assign mask_o = mask_q; + + always_comb begin + case (ctrl_i.mask_type) + None: begin + mask_col_offset_d = '0; + mask_tile_x_pos_d = '0; + mask_tile_y_pos_d = '0; + mask_pos_d = '0; + mask_d = '0; + end + UpperTriangular: begin + mask_col_offset_d = (step_i == QK || step_i == AV) ? mask_col_offset_q : ((ctrl_i.mask_start_index) & (N-1)); + mask_tile_x_pos_d = (step_i == QK || step_i == AV) ? mask_tile_x_pos_q : ((ctrl_i.mask_start_index) / M); + mask_tile_y_pos_d = mask_tile_y_pos_q; + mask_pos_d = (step_i == QK || step_i == AV) ? 
mask_pos_q : ((((ctrl_i.mask_start_index)/N)*M) & ((M*M/N)-1)); + mask_d = '0; + + if (step_i == QK) begin + if (mask_tile_x_pos_q == tile_x_i && mask_tile_y_pos_q == tile_y_i && last_inner_tile_i == 1'b1) begin + if (count_i == ((M * M / N) - 1)) begin + mask_tile_x_pos_d = mask_tile_x_pos_q + 1'b1; + end + if ((count_i >= mask_pos_q) && (count_i < (mask_pos_q + N))) begin + if ((count_i & (M-1)) == (M-1) && !(((count_i + mask_col_offset_q) & (N-1)) == (N-1))) begin + mask_tile_y_pos_d = tile_y_i + 1'b1; + mask_tile_x_pos_d = tile_x_i; + mask_pos_d = ((count_i + (((ctrl_i.tile_s * (M*M/N)) - M) + 1)) & ((M*M/N)-1)); + end else if ((count_i & (M-1)) == (M-1) && (((count_i + mask_col_offset_q) & (N-1)) == (N-1))) begin + if ((count_i / M) == ((M / N) - 1)) begin + mask_tile_y_pos_d = tile_y_i + 1'b1; + mask_tile_x_pos_d = tile_x_i + 1'b1; + mask_pos_d = ((count_i + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); + end else begin + mask_tile_y_pos_d = tile_y_i + 1'b1; + mask_tile_x_pos_d = tile_x_i; + mask_pos_d = ((count_i + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); + end + end else if (((count_i + mask_col_offset_q) & (N - 1)) == (N - 1)) begin + mask_pos_d = (mask_pos_q + (N - ((mask_pos_q + mask_col_offset_q) & (N-1))) + M) & ((M*M/N)-1); + end + for (int i = 0; i < N; i++) begin + if (((count_i + mask_col_offset_q) & (N - 1)) <= i) begin + mask_d[i] = 1'b1; + end else begin + mask_d[i] = 1'b0; + end + end + end else if ((count_i & (M - 1)) < (mask_pos_q & (M - 1))) begin + for (int i = 0; i < N; i++) begin + mask_d[i] = 1'b1; + end + end + end else if (mask_tile_x_pos_q <= tile_x_i && mask_tile_y_pos_q != tile_y_i && last_inner_tile_i == 1'b1) begin + for (int i = 0; i < N; i++) begin + mask_d[i] = 1'b1; + end + end else if (mask_tile_x_pos_q != tile_x_i && mask_tile_y_pos_q == tile_y_i && last_inner_tile_i == 1'b1) begin + for (int i = 0; i < N; i++) begin + mask_d[i] = 1'b0; + end + end + end + end + LowerTriangular: begin + mask_col_offset_d = '0; + 
mask_tile_x_pos_d = '0; + mask_tile_y_pos_d = (step_i == QK || step_i == AV) ? mask_tile_y_pos_q : ((ctrl_i.mask_start_index) / M); + mask_pos_d = (step_i == QK || step_i == AV) ? mask_pos_q : (ctrl_i.mask_start_index & (M-1)); + mask_d = '0; + + if (step_i == QK) begin + if (mask_tile_x_pos_q == tile_x_i && mask_tile_y_pos_q == tile_y_i && last_inner_tile_i == 1'b1) begin + if (count_i == ((M * M / N) - 1)) begin + mask_tile_x_pos_d = mask_tile_x_pos_q + 1'b1; + end + if ((count_i >= mask_pos_q) && (count_i < (mask_pos_q + N))) begin + if (((count_i & (M-1)) == (M-1)) && !(((count_i + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) == (N-1))) begin + mask_tile_y_pos_d = tile_y_i + 1'b1; + mask_tile_x_pos_d = tile_x_i; + mask_pos_d = ((count_i + (((ctrl_i.tile_s * (M*M/N)) - M) + 1)) & ((M*M/N)-1)); + end else if (((count_i & (M-1)) == (M-1)) && (((count_i + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) == (N-1))) begin + if ((count_i / M) == ((M / N) - 1)) begin + mask_tile_y_pos_d = tile_y_i + 1'b1; + mask_tile_x_pos_d = tile_x_i + 1'b1; + mask_pos_d = ((count_i + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); + end else begin + mask_tile_y_pos_d = tile_y_i + 1'b1; + mask_tile_x_pos_d = tile_x_i; + mask_pos_d = ((count_i + ((ctrl_i.tile_s * (M*M/N)) + 1)) & ((M*M/N)-1)); + end + end else if (((count_i + (N - (ctrl_i.mask_start_index & (N-1)))) & (N-1)) == (N-1)) begin + mask_pos_d = (mask_pos_q + (count_i - mask_pos_q + 1) + M) & ((M * M / N) - 1); + end + for (int i = 0; i < N; i++) begin + if (((count_i + (N - (ctrl_i.mask_start_index & (N - 1)))) & (N - 1)) >= i) begin + mask_d[i] = 1'b1; + end else begin + mask_d[i] = 1'b0; + end + end + end else if ((count_i & (M - 1)) >= (mask_pos_q & (M - 1))) begin + for (int i = 0; i < N; i++) begin + mask_d[i] = 1'b1; + end + end + end else if (mask_tile_x_pos_q > tile_x_i && mask_tile_y_pos_q == tile_y_i && last_inner_tile_i == 1'b1) begin + for (int i = 0; i < N; i++) begin + mask_d[i] = 1'b1; + end + end else 
if (mask_tile_x_pos_q >= tile_x_i && mask_tile_y_pos_q != tile_y_i && last_inner_tile_i == 1'b1) begin + for (int i = 0; i < N; i++) begin + mask_d[i] = 1'b0; + end + end + end + end + Strided: begin + mask_col_offset_d = '0; + mask_tile_x_pos_d = '0; + mask_tile_y_pos_d = '0; + mask_pos_d = '0; + mask_d = '0; + + if (step_i == QK) begin + if (last_inner_tile_i == 1'b1) begin + for (int i = 0; i < N; i++) begin + //col_pos = count_i/M * N + i + tile_x_i * M + //row_pos = count_i & (M-1) + tile_y_i * M + //Marcel Kant: Only works if ctrl_i.mask_start_index is a power of two + if ((((((count_i / M) * N) + i + (tile_x_i * M)) - ((count_i & (M-1)) + (tile_y_i * M))) & (ctrl_i.mask_start_index-1)) == 0) begin + mask_d[i] = 1'b0; + end else begin + mask_d[i] = 1'b1; + end + end + end + end + end + endcase + end + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (~rst_ni) begin + mask_pos_q <= '0; + mask_tile_x_pos_q <= '0; + mask_tile_y_pos_q <= '0; + mask_col_offset_q <= '0; + mask_q <= '0; + end else begin + if (calc_en_i) begin + mask_pos_q <= mask_pos_d; + mask_tile_x_pos_q <= mask_tile_x_pos_d; + mask_tile_y_pos_q <= mask_tile_y_pos_d; + end + mask_col_offset_q <= mask_col_offset_d; + mask_q <= mask_d; + end + end + +endmodule diff --git a/src/ita_package.sv b/src/ita_package.sv index f63a299..184e0f9 100644 --- a/src/ita_package.sv +++ b/src/ita_package.sv @@ -49,10 +49,11 @@ package ita_package; typedef logic [N_REQUANT_CONSTS-1:0][EMS-1:0] requant_const_array_t; typedef logic signed [WI-1:0] requant_t; typedef logic signed [N_REQUANT_CONSTS-1:0][WI-1:0] requant_array_t; - typedef logic [WO-WI*2-2:0] seq_length_t; - typedef logic [WO-WI*2-2:0] proj_space_t; - typedef logic [WO-WI*2-2:0] embed_size_t; - typedef logic [WO-WI*2-2:0] ff_size_t; + typedef logic [WO-WI*2-1:0] input_dim_t; + typedef input_dim_t seq_length_t; + typedef input_dim_t proj_space_t; + typedef input_dim_t embed_size_t; + typedef input_dim_t ff_size_t; typedef logic [ 32-1:0]
tile_t; typedef struct packed { logic start ; diff --git a/src/ita_softmax.sv b/src/ita_softmax.sv index e99653b..70bdfe8 100644 --- a/src/ita_softmax.sv +++ b/src/ita_softmax.sv @@ -187,6 +187,7 @@ module ita_softmax write_max_addr_o = count_q3; write_max_data_o = max_q; for (int i = 0; i < N; i++) begin + //Marcel Kant: This if statement is most likely not required if (shift_q[i] != 4'hF) exp_sum_d += unsigned'(9'h100)>>shift_q[i]; end @@ -317,7 +318,7 @@ module ita_softmax end Strided: begin //col_pos = i + mask_tile_x_q * M - //row_pos = count_soft_mask_q & (M-1) + mask_tile_y_pos_q * M + //row_pos = count_soft_mask_q & (M-1) + mask_tile_y_q * M if ((((i + (mask_tile_x_q * M)) - ((count_soft_mask_q & (M-1)) + (mask_tile_y_q * M))) & (ctrl_i.mask_start_index-1)) == 0) begin disable_col[i] = 1'b0; end else begin