diff --git a/disaggregated_memory/boards/arm_main_board.py b/disaggregated_memory/boards/arm_main_board.py index 9daf51fb83..c4f2ae5f92 100644 --- a/disaggregated_memory/boards/arm_main_board.py +++ b/disaggregated_memory/boards/arm_main_board.py @@ -28,7 +28,6 @@ # into one single board. import os import sys - from typing import ( List, Sequence, @@ -40,6 +39,7 @@ os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) ) +from cachehierarchies.dm_caches import ClassicPrivateL1PrivateL2SharedL3DMCache from memories.external_remote_memory import ExternalRemoteMemory import m5 @@ -53,6 +53,7 @@ Port, SrcClockDomain, Terminal, + VExpress_GEM5_V1, VncServer, VoltageDomain, ) @@ -61,9 +62,9 @@ ArmDefaultRelease, ArmRelease, ) -from m5.objects.RealView import ( - VExpress_GEM5_Base, - VExpress_GEM5_Foundation, +from m5.util import ( + fatal, + warn, ) from m5.util.fdthelper import ( Fdt, @@ -78,13 +79,14 @@ from gem5.components.cachehierarchies.abstract_cache_hierarchy import ( AbstractCacheHierarchy, ) +from gem5.components.memory import SingleChannelDDR4_2400 from gem5.components.memory.abstract_memory_system import AbstractMemorySystem from gem5.components.processors.abstract_processor import AbstractProcessor +from gem5.components.processors.cpu_types import CPUTypes +from gem5.components.processors.simple_processor import SimpleProcessor +from gem5.isas import ISA from gem5.utils.override import overrides -from m5.util import ( - fatal, - warn, -) + class ArmComposableMemoryBoard(ArmBoard): """ @@ -108,8 +110,6 @@ class ArmComposableMemoryBoard(ArmBoard): local memory or at a custom address range defined by the user. :cache_hierarchy: An abstract_cache_hierarchy compatible with local and remote memories. - :platform: Arm-specific platform to use with this board. - :release: Arm-specific extensions to use with this board. :remote_memory_access_cycles: Optionally add some latency to access the remote memory. If the remote memory is being simulated in SST, then pass this as a param on the sst-side runscript. @@ -118,16 +118,9 @@ class ArmComposableMemoryBoard(ArmBoard): """ def __init__( - self, - clk_freq: str, - processor: AbstractProcessor, - local_memory: AbstractMemorySystem, - remote_memory: AbstractMemorySystem, - cache_hierarchy: AbstractCacheHierarchy, - platform: VExpress_GEM5_Base = VExpress_GEM5_Foundation(), - release: ArmRelease = ArmDefaultRelease(), - remote_memory_access_cycles: int = 750, - remote_memory_address_range: AddrRange = None, + self,/ + remote_memory_address_range, + use_sst ) -> None: # The parent board calls get_memory(), which needs overriding. self._localMemory = local_memory @@ -167,18 +160,23 @@ def __init__( size=self._remoteMemory.get_size(), ) assert self._remoteMemoryAddressRange is not None + # Memory: Dual Channel DDR4 2400 DRAM device. + + self.local_memory = SingleChannelDDR4_2400(size="8GiB") super().__init__( - clk_freq=clk_freq, - processor=processor, - memory=local_memory, - cache_hierarchy=cache_hierarchy, - platform=platform, - release=release, + clk_freq="4GHz", + processor=SimpleProcessor(cpu_type=CPUTypes.O3, isa=ISA.ARM, num_cores=8), + memory=self.local_memory, + cache_hierarchy=ClassicPrivateL1PrivateL2SharedL3DMCache( + l1d_size="32KiB", l1i_size="32KiB", l2_size="1MiB", l3_size="2MiB" + ), + platform=VExpress_GEM5_V1(), + release=ArmDefaultRelease.for_kvm(), + ) + self.remote_memory = ExternalRemoteMemory( + addr_range=remote_memory_address_range, use_sst_sim=use_sst ) - - self.local_memory = local_memory - self.remote_memory = remote_memory # The amount of latency to access the remote memory has to be either # implemented using a non-coherent crossbar that connects the the diff --git a/disaggregated_memory/configs/arm-main.py b/disaggregated_memory/configs/arm-main.py index f1f9cd02e8..084aa6d930 100644 --- a/disaggregated_memory/configs/arm-main.py +++ b/disaggregated_memory/configs/arm-main.py @@ -180,8 +180,6 @@ local_memory=local_memory, remote_memory=remote_memory, cache_hierarchy=cache_hierarchy, - platform=VExpress_GEM5_V1(), - release=ArmDefaultRelease.for_kvm(), ) # commands to execute to run the simulation. @@ -218,12 +216,13 @@ # Since we are using kvm to boot the system, we can boot the system with # systemd enabled! -cmd = ["m5 --addr=0x10010000 exit;"] \ - + local_stream \ - + interleave_stream \ - + remote_stream \ +cmd = ( + ["m5 --addr=0x10010000 exit;"] + + local_stream + + interleave_stream + + remote_stream + ["m5 --addr=0x10010000 exit;"] - +) workload = CustomWorkload( diff --git a/disaggregated_memory/configs/exp-stream-interleave.py b/disaggregated_memory/configs/exp-stream-interleave.py index f2aadcda64..8137d7c4ef 100644 --- a/disaggregated_memory/configs/exp-stream-interleave.py +++ b/disaggregated_memory/configs/exp-stream-interleave.py @@ -156,14 +156,14 @@ # Here we setup the parameters of the l1 and l2 caches. cache_hierarchy = ClassicPrivateL1PrivateL2SharedL3DMCache( - l1d_size="32KiB", l1i_size="32KiB", l2_size="2MiB", l3_size="16MiB" + l1d_size="32KiB", l1i_size="32KiB", l2_size="1MiB", l3_size="2MiB" ) # cache_hierarchy = ClassicPrivateL1PrivateL2DMCache( # l1d_size="32KiB", l1i_size="32KiB", l2_size="4MiB" # ) # Memory: Dual Channel DDR4 2400 DRAM device. -local_memory = DualChannelDDR4_2400(size=args.local_memory_size) +local_memory = SingleChannelDDR4_2400(size=args.local_memory_size) # Either suppy the size of the remote memory or the address range of the # remote memory. Since this is inside the external memory, it does not matter @@ -199,7 +199,7 @@ "numastat;", "numactl --interleave=0,1 -- " + "/home/ubuntu/simple-vectorizable-benchmarks/stream/" - + "stream.hw.m5 3145728;", + + "stream.hw.m5 8388608;", "numastat;", ] diff --git a/disaggregated_memory/configs/exp-stream-local.py b/disaggregated_memory/configs/exp-stream-local.py index ff226bd028..aaab959dce 100644 --- a/disaggregated_memory/configs/exp-stream-local.py +++ b/disaggregated_memory/configs/exp-stream-local.py @@ -156,14 +156,14 @@ # Here we setup the parameters of the l1 and l2 caches. cache_hierarchy = ClassicPrivateL1PrivateL2SharedL3DMCache( - l1d_size="32KiB", l1i_size="32KiB", l2_size="2MiB", l3_size="16MiB" + l1d_size="32KiB", l1i_size="32KiB", l2_size="1MiB", l3_size="2MiB" ) # cache_hierarchy = ClassicPrivateL1PrivateL2DMCache( # l1d_size="32KiB", l1i_size="32KiB", l2_size="4MiB" # ) # Memory: Dual Channel DDR4 2400 DRAM device. -local_memory = DualChannelDDR4_2400(size=args.local_memory_size) +local_memory = SingleChannelDDR4_2400(size=args.local_memory_size) # Either suppy the size of the remote memory or the address range of the # remote memory. Since this is inside the external memory, it does not matter @@ -199,7 +199,7 @@ "numastat;", "numactl --membind=0 -- " + "/home/ubuntu/simple-vectorizable-benchmarks/stream/" - + "stream.hw.m5 3145728;", + + "stream.hw.m5 8388608;", "numastat;", ] diff --git a/disaggregated_memory/configs/exp-stream-remote.py b/disaggregated_memory/configs/exp-stream-remote.py index 4d88fe0c24..b544e03833 100644 --- a/disaggregated_memory/configs/exp-stream-remote.py +++ b/disaggregated_memory/configs/exp-stream-remote.py @@ -42,8 +42,6 @@ ) from boards.arm_main_board import ArmComposableMemoryBoard -from cachehierarchies.dm_caches import ClassicPrivateL1PrivateL2DMCache -from cachehierarchies.dm_caches import ClassicPrivateL1PrivateL2SharedL3DMCache from memories.external_remote_memory import ExternalRemoteMemory import m5 @@ -55,36 +53,17 @@ from m5.objects.RealView import VExpress_GEM5_V1 from m5.util import warn -from gem5.components.memory import ( - DualChannelDDR4_2400, - SingleChannelDDR4_2400, -) -from gem5.components.processors.cpu_types import CPUTypes -from gem5.components.processors.simple_processor import SimpleProcessor -from gem5.isas import ISA from gem5.resources.resource import * from gem5.resources.workload import * from gem5.resources.workload import Workload +from gem5.simulate import exit_event_generators +from gem5.simulate.exit_event import ExitEvent from gem5.simulate.simulator import Simulator from gem5.utils.requires import requires # SST passes a couple of arguments for this system to simulate. parser = argparse.ArgumentParser() -# basic parameters. -parser.add_argument( - "--cpu-type", - type=str, - choices=["atomic", "timing", "o3", "kvm"], - default="atomic", - help="CPU type", -) -parser.add_argument( - "--cpu-clock-rate", - type=str, - required=True, - help="CPU Clock", -) parser.add_argument( "--instance", type=int, @@ -92,35 +71,12 @@ help="Instance id is need to correctly read and write to the " + "checkpoint in a multi-node simulation.", ) - -# Parameters related to local memory -parser.add_argument( - "--local-memory-size", - type=str, - required=True, - help="Local memory size", -) - -# Parameters related to remote memory -parser.add_argument( - "--is-composable", - type=str, - required=True, - choices=["True", "False"], - help="Tell the simulation to either use gem5 or SST as the remote memory.", -) parser.add_argument( "--remote-memory-addr-range", type=str, required=True, help="Remote memory range", ) -parser.add_argument( - "--remote-memory-latency", - type=int, - required=True, - help="Remote memory latency in Ticks (has to be converted prior)", -) # Parameters related to checkpoints. parser.add_argument( @@ -132,62 +88,19 @@ ) parser.add_argument( "--take-ckpt", - type=str, - default="False", + type=bool, + default=False, required=True, help="optionally put a path to restore a checkpoint", ) args = parser.parse_args() -cpu_type = { - "o3": CPUTypes.O3, - "atomic": CPUTypes.ATOMIC, - "timing": CPUTypes.TIMING, - "kvm": CPUTypes.KVM, -}[args.cpu_type] -use_sst = {"True": True, "False": False}[args.is_composable] - remote_memory_range = list(map(int, args.remote_memory_addr_range.split(","))) remote_memory_range = AddrRange(remote_memory_range[0], remote_memory_range[1]) -# This runs a check to ensure the gem5 binary is compiled for ARM. -requires(isa_required=ISA.ARM) - -# Here we setup the parameters of the l1 and l2 caches. -cache_hierarchy = ClassicPrivateL1PrivateL2SharedL3DMCache( - l1d_size="32KiB", l1i_size="32KiB", l2_size="2MiB", l3_size="16MiB" -) -# cache_hierarchy = ClassicPrivateL1PrivateL2DMCache( -# l1d_size="32KiB", l1i_size="32KiB", l2_size="4MiB" -# ) - -# Memory: Dual Channel DDR4 2400 DRAM device. -local_memory = DualChannelDDR4_2400(size=args.local_memory_size) - -# Either suppy the size of the remote memory or the address range of the -# remote memory. Since this is inside the external memory, it does not matter -# what type of memory is being simulated. This can either be initialized with -# a size or a memory address range, which is mroe flexible. Adding remote -# memory latency automatically adds a non-coherent crossbar to simulate latency -remote_memory = ExternalRemoteMemory( - addr_range=remote_memory_range, use_sst_sim=use_sst -) - -# Here we setup the processor. We use a simple processor. -processor = SimpleProcessor(cpu_type=cpu_type, isa=ISA.ARM, num_cores=8) -# breakpoint() # Here we setup the board which allows us to do Full-System ARM simulations. -board = ArmComposableMemoryBoard( - clk_freq=args.cpu_clock_rate, - processor=processor, - local_memory=local_memory, - remote_memory=remote_memory, - cache_hierarchy=cache_hierarchy, - platform=VExpress_GEM5_V1(), - release=ArmDefaultRelease.for_kvm(), - remote_memory_access_cycles = 0 -) +board = ArmComposableMemoryBoard(remote_memory_range, use_sst=True) # commands to execute to run the simulation. mount_cmd = ["mount -t sysfs - /sys;", "mount -t proc - /proc;"] @@ -199,7 +112,7 @@ "numastat;", "numactl --membind=1 -- " + "/home/ubuntu/simple-vectorizable-benchmarks/stream/" - + "stream.hw.m5 3145728;", + + "stream.hw.m5 8388608;", "numastat;", ] @@ -244,40 +157,22 @@ # This disk image needs to have NUMA tools installed. board.set_workload(workload) -# This script will boot two NUMA nodes in a full system simulation where the -# gem5 node will be sending instructions to the SST node. the simulation will -# after displaying numastat information on the terminal, which can be viewed -# from board.terminal. -board._pre_instantiate() -root = Root(full_system=True, board=board) -board._post_instantiate() - - -# define on_exit_event -def handle_exit(): - yield True # Stop the simulation. We're done. - +if args.take_ckpt: + exit_event = exit_event_generators.save_checkpoint_generator +else: + exit_event = exit_event_generators.exit_generator -# Here are the different scenarios: -# no checkpoint, run everything in gem5 -if args.take_ckpt == "True": - if args.cpu_type == "kvm": - # ensure that sst is not being used here. - assert use_sst == False - root.sim_quantum = int(1e9) - m5.instantiate() +simulator = Simulator( + board=board, + on_exit_event={ + ExitEvent.EXIT: exit_event, + }, +) - # probably this script is being called only in gem5. Since we are not using - # the simulator module, we might have to add more m5.simulate() - m5.simulate() - if ckpt_to_read_write != "": - m5.checkpoint(os.path.join(os.getcwd(), ckpt_to_read_write)) +if not use_sst: + simulator.run() else: # This is called in SST. SST will take care of running this script. - # Instantiate the system regardless of the simulator. - m5.instantiate(ckpt_to_read_write) - - # we can still use gem5. So making another if-else - if use_sst == False: - m5.simulate() - # otherwise just let SST do the simulation. + # SST won't call instantiate, though, since it doesn't use the simulator + # object + simulator._instantiate() diff --git a/ext/sst/sst/arm_composable_memory.py b/ext/sst/sst/arm_composable_memory.py index 8ada7cf06a..e541202b97 100644 --- a/ext/sst/sst/arm_composable_memory.py +++ b/ext/sst/sst/arm_composable_memory.py @@ -125,7 +125,7 @@ def get_address_range(node, local_mem_size, remote_mem_size, blank_mem_size): sst_memory_size = str( (memory_nodes * int(node_memory_slice[0])) + \ ((system_nodes) * int(remote_memory_slice[0:1])) + \ - int(blank_memory_space[0]) + int(blank_memory_space[0])) print(sst_memory_size, addr_range_end) # There is one cache bus connecting all gem5 ports to the remote memory. diff --git a/ext/sst/sst/exp_stream_remote_arm_composable_memory.py b/ext/sst/sst/exp_stream_remote_arm_composable_memory.py index 8fdaa76aac..e12496db22 100644 --- a/ext/sst/sst/exp_stream_remote_arm_composable_memory.py +++ b/ext/sst/sst/exp_stream_remote_arm_composable_memory.py @@ -36,7 +36,7 @@ disaggregated_memory_latency = "1ps" cache_link_latency = "1ps" -cpu_clock_rate = "3.1GHz" +cpu_clock_rate = "4GHz" def connect_components(link_name: str, low_port_name: str, low_port_idx: int, high_port_name: str, high_port_idx: int, @@ -93,7 +93,7 @@ def get_address_range(node, local_mem_size, remote_mem_size, blank_mem_size): # Define the CPU type cpu_type = "o3" -gem5_run_script = "../../disaggregated_memory/configs/exp-stream-remote.py" +gem5_run_script = "disaggregated_memory/configs/exp-stream-remote.py" # =========================================================================== # @@ -106,12 +106,13 @@ def get_address_range(node, local_mem_size, remote_mem_size, blank_mem_size): # This example uses fixed number of node size -> 2 GiB # The directory controller decides where the addresses are mapped to. -node_memory_slice = "2GiB" -node_memory_slice_in_hex = 0x80000000 +node_memory_slice = "8GiB" +node_memory_slice_in_hex = 0x200000000 -# We are use 32 GiB of remote memory per node. -remote_memory_slice = "2GiB" -remote_memory_slice_in_hex = 0x80000000 +# This script should only be used for the STREAM experiments. +# We are use 1 GiB of remote memory per node. +remote_memory_slice = "1GiB" +remote_memory_slice_in_hex = 0x40000000 # The first 2 GB is ignored for I/O devices. blank_memory_space = "2GiB" @@ -119,7 +120,7 @@ def get_address_range(node, local_mem_size, remote_mem_size, blank_mem_size): # SST memory node size. Each system gets a 32 GiB slice of fixed memory. assert(len(node_memory_slice) == 4), "The length of local mem size must be 4" -assert(len(remote_memory_slice) == 4), "The length of remote mem size must be 5" +assert(len(remote_memory_slice) == 4), "The length of remote mem size must be 4" assert(len(blank_memory_space) == 4), "The length must be 4" # \033[92m {}\033[00m sst_memory_size = str( @@ -158,9 +159,14 @@ def get_address_range(node, local_mem_size, remote_mem_size, blank_mem_size): "channels" : 4, "channel.numRanks" : 2, "channel.rank.numBanks" : 16, - "channel.rank.bank.pagePolicy" : "memHierarchy.timeoutPagePolicy", + "channel.transaction_Q_size": 128, + "channel.rank.bank.CL" : 14, + "channel.rank.bank.RCD" : 14, + "channel.rank.bank.TRAS" : 32, + "channel.rank.bank.TRP" : 14, + "channel.rank.bank.pagePolicy" : "memHierarchy.simplePagePolicy", "channel.rank.bank.transactionQ" : "memHierarchy.reorderTransactionQ", - "channel.rank.bank.pagePolicy.timeoutCycles" : 50, + "channel.rank.bank.pagePolicy.close" : 0, "printconfig" : 1, }) @@ -196,7 +202,7 @@ def get_address_range(node, local_mem_size, remote_mem_size, blank_mem_size): } port_list = [] for port in ports: - port_list.append(port) + port_list.append(port) cpu_params = { "frequency" : cpu_clock_rate,