From a83523468f6183ab2ca2e72073f94e2c5aa7c413 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 21 Feb 2024 12:27:50 +0300 Subject: [PATCH 01/27] issue-617 Base Class --- pdr_backend/lake/base_data_store.py | 43 +++++++++++++++++++ pdr_backend/lake/test/test_base_data_store.py | 20 +++++++++ 2 files changed, 63 insertions(+) create mode 100644 pdr_backend/lake/base_data_store.py create mode 100644 pdr_backend/lake/test/test_base_data_store.py diff --git a/pdr_backend/lake/base_data_store.py b/pdr_backend/lake/base_data_store.py new file mode 100644 index 000000000..368841df7 --- /dev/null +++ b/pdr_backend/lake/base_data_store.py @@ -0,0 +1,43 @@ +from hashlib import md5 +from abc import abstractmethod +from typing import Optional, Literal + +import duckdb +from enforce_typing import enforce_types + + +class BaseDataStore: + @enforce_types + def __init__(self, base_directory=str): + """ + Initialize a PartitionedDataStore instance. + @arguments: + base_directory - The base directory to store the partitioned Parquet files. + """ + + self.base_directory = base_directory + self.duckdb_conn = duckdb.connect( + database=f"{self.base_directory}/duckdb.db" + ) # Keep a persistent connection + + @enforce_types + def _generate_view_name(self, base_path=str) -> str: + """ + Generate a unique view name for a given base path. + @arguments: + base_path - The base path to generate a view name for. + @returns: + str - A unique view name. + """ + + hash_object = md5(base_path.encode()) + return f"dataset_{hash_object.hexdigest()}" + + @abstractmethod + def query_data( + self, + dataset_identifier: str, + query: str, + partition_type: Optional[Literal["date", "address"]] = None, + ): + pass diff --git a/pdr_backend/lake/test/test_base_data_store.py b/pdr_backend/lake/test/test_base_data_store.py new file mode 100644 index 000000000..6dd20cfc4 --- /dev/null +++ b/pdr_backend/lake/test/test_base_data_store.py @@ -0,0 +1,20 @@ +from pdr_backend.lake.base_data_store import BaseDataStore + + +def _get_test_manager(tmpdir): + return BaseDataStore(str(tmpdir)) + + +def test__generate_view_name(tmpdir): + """ + Test the _generate_view_name method. 
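+    The generated name is expected to be "dataset_" followed by an md5 hexdigest.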
+ """ + test_manager = _get_test_manager(tmpdir) + view_name = test_manager._generate_view_name(str(tmpdir)) + + # check if the view name starts with "dataset_" + assert view_name.startswith( + "dataset_" + ), "The view name does not start with 'dataset_'" + # check if the view name continues with a hash + assert len(view_name) > 8, "The view name is too short" From f6cc990f1ea81555c1613289d4a670dd6e80dec1 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 21 Feb 2024 12:30:05 +0300 Subject: [PATCH 02/27] issue617 - Persistent Data Store --- pdr_backend/lake/persistent_data_store.py | 150 ++++++++++++++ .../lake/test/test_persistent_data_store.py | 183 ++++++++++++++++++ 2 files changed, 333 insertions(+) create mode 100644 pdr_backend/lake/persistent_data_store.py create mode 100644 pdr_backend/lake/test/test_persistent_data_store.py diff --git a/pdr_backend/lake/persistent_data_store.py b/pdr_backend/lake/persistent_data_store.py new file mode 100644 index 000000000..6dc02f45e --- /dev/null +++ b/pdr_backend/lake/persistent_data_store.py @@ -0,0 +1,150 @@ +# The PersistentDataStore class is a subclass of the Base +import os +import glob + +from enforce_typing import enforce_types +import polars as pl + +from pdr_backend.lake.base_data_store import BaseDataStore + + +class PersistentDataStore(BaseDataStore): + """ + A class to store and retrieve persistent data. + """ + + def __init__(self, base_directory: str): + """ + Initialize a PersistentDataStore instance. + @arguments: + base_directory - The base directory to store the persistent data. + """ + super().__init__(base_directory) + + @enforce_types + def _create_and_fill_table( + self, df: pl.DataFrame, dataset_identifier: str + ): # pylint: disable=unused-argument + """ + Create the dataset and insert data to the persistent dataset. + @arguments: + df - The Polars DataFrame to append. + dataset_identifier - A unique identifier for the dataset. + """ + + view_name = self._generate_view_name(self.base_directory + dataset_identifier) + + # self.duckdb_conn.register(view_name, df) + # Create the table + self.duckdb_conn.execute(f"CREATE TABLE {view_name} AS SELECT * FROM df") + + @enforce_types + def insert_to_table(self, df: pl.DataFrame, dataset_identifier: str): + """ + Insert data to an persistent dataset. + @arguments: + df - The Polars DataFrame to append. + dataset_identifier - A unique identifier for the dataset. + @example: + df = pl.DataFrame({ + "id": [1, 2, 3], + "name": ["John", "Jane", "Doe"], + "age": [25, 30, 35] + }) + insert_to_table(df, "people") + """ + + view_name = self._generate_view_name(self.base_directory + dataset_identifier) + # Check if the table exists + tables = self.duckdb_conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" + ).fetchall() + + if view_name in [table[0] for table in tables]: + self.duckdb_conn.execute(f"INSERT INTO {view_name} SELECT * FROM df") + else: + self._create_and_fill_table(df, dataset_identifier) + + @enforce_types + def query_data( + self, dataset_identifier: str, query: str, partition_type: None = None + ) -> pl.DataFrame: + """ + Execute a SQL query across the persistent dataset using DuckDB. + @arguments: + dataset_identifier - A unique identifier for the dataset. + query - The SQL query to execute. + @returns: + pl.DataFrame - The result of the query. 
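+        The query string may reference the dataset through the "{view_name}" placeholder.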
+ @example: + query_data("people", "SELECT * FROM {view_name}") + """ + + view_name = self._generate_view_name(self.base_directory + dataset_identifier) + result_df = self.duckdb_conn.execute(query.format(view_name=view_name)).df() + + return pl.DataFrame(result_df) + + @enforce_types + def drop_table(self, dataset_identifier: str, ds_type: str = "table"): + """ + Drop the persistent dataset. + @arguments: + dataset_identifier - A unique identifier for the dataset. + ds_type - The type of the dataset to drop. Either "table" or "view". + @example: + drop_table("people") + """ + + if ds_type not in ["view", "table"]: + raise ValueError("ds_type must be either 'view' or 'table'") + + view_name = self._generate_view_name(self.base_directory + dataset_identifier) + self.duckdb_conn.execute(f"DROP {ds_type} {view_name}") + + @enforce_types + def fill_from_csv_destination(self, csv_folder_path: str, dataset_identifier: str): + """ + Fill the persistent dataset from CSV files. + @arguments: + csv_folder_path - The path to the folder containing the CSV files. + dataset_identifier - A unique identifier for the dataset. + @example: + fill_from_csv_destination("data/csv", "people") + """ + + csv_files = glob.glob(os.path.join(csv_folder_path, "*.csv")) + + print("csv_files", csv_files) + for csv_file in csv_files: + df = pl.read_csv(csv_file) + self.insert_to_table(df, dataset_identifier) + + @enforce_types + def update_data( + self, df: pl.DataFrame, dataset_identifier: str, identifier_column: str + ): + """ + Update the persistent dataset with the provided DataFrame. + @arguments: + df - The Polars DataFrame to update. + dataset_identifier - A unique identifier for the dataset. + identifier_column - The column to use as the identifier for the update. + @example: + df = pl.DataFrame({ + "id": [1, 2, 3], + "name": ["John", "Jane", "Doe"], + "age": [25, 30, 35] + }) + update_data(df, "people", "id") + """ + + view_name = self._generate_view_name(self.base_directory + dataset_identifier) + update_columns = ", ".join( + [f"{column} = {df[column]}" for column in df.columns] + ) + self.duckdb_conn.execute( + f"""UPDATE {view_name} + SET {update_columns} + WHERE {identifier_column} = {df[identifier_column]}""" + ) diff --git a/pdr_backend/lake/test/test_persistent_data_store.py b/pdr_backend/lake/test/test_persistent_data_store.py new file mode 100644 index 000000000..e97fc366b --- /dev/null +++ b/pdr_backend/lake/test/test_persistent_data_store.py @@ -0,0 +1,183 @@ +import os +import polars as pl +from pdr_backend.lake.persistent_data_store import ( + PersistentDataStore, +) # Adjust the import based on your project structure + + +# Initialize the PartitionedDataStore instance for testing +def _get_test_manager(tmpdir): + example_df = pl.DataFrame( + {"timestamp": ["2022-01-01", "2022-02-01", "2022-03-01"], "value": [10, 20, 30]} + ) + dataset_identifier = "test_df" + + return [PersistentDataStore(str(tmpdir)), example_df, dataset_identifier] + + +def _clean_up_test_manager(tmpdir, dataset_identifier): + # Clean up the test manager + dataset_path = os.path.join(str(tmpdir), dataset_identifier) + + persistent_ds_instance = PersistentDataStore(str(tmpdir)) + + view_name = persistent_ds_instance._generate_view_name(dataset_path) + + # Select tables from duckdb + views = persistent_ds_instance.duckdb_conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" + ).fetchall() + + # Drop the view and table + if view_name in [table[0] for table in views]: + 
persistent_ds_instance.duckdb_conn.execute(f"DROP TABLE {view_name}") + + +def _check_view_exists(tmpdir, test_manager, dataset_identifier): + view_name = test_manager._generate_view_name(str(tmpdir) + dataset_identifier) + tables = test_manager.duckdb_conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" + ).fetchall() + return [view_name in [table[0] for table in tables], view_name] + + +def test_create_and_fill_table(tmpdir): + test_manager, example_df, dataset_identifier = _get_test_manager(tmpdir) + + test_manager._create_and_fill_table(example_df, dataset_identifier) + + # Check if the view is registered + assert _check_view_exists(tmpdir, test_manager, dataset_identifier) + _clean_up_test_manager(tmpdir, dataset_identifier) + + +def test_insert_to_exist_table(tmpdir): + test_manager, example_df, dataset_identifier = _get_test_manager(tmpdir) + + test_manager._create_and_fill_table(example_df, dataset_identifier) + + # Check if the view is registered + check_result, view_name = _check_view_exists( + tmpdir, test_manager, dataset_identifier + ) + assert check_result + + # Insert new data to the table + example_df = pl.DataFrame( + {"timestamp": ["2022-04-01", "2022-05-01", "2022-06-01"], "value": [40, 50, 60]} + ) + test_manager.insert_to_table(example_df, dataset_identifier) + + # Check if the view is registered + check_result, view_name = _check_view_exists( + tmpdir, test_manager, dataset_identifier + ) + assert check_result + + # Check if the new data is inserted + result = test_manager.duckdb_conn.execute(f"SELECT * FROM {view_name}").fetchall() + assert len(result) == 6 + print(result) + assert result[3][0] == "2022-04-01" + assert result[3][1] == 40 + assert result[4][0] == "2022-05-01" + assert result[4][1] == 50 + assert result[5][0] == "2022-06-01" + assert result[5][1] == 60 + _clean_up_test_manager(tmpdir, dataset_identifier) + + +def test_insert_to_new_table(tmpdir): + test_manager, example_df, dataset_identifier = _get_test_manager(tmpdir) + + test_manager.insert_to_table(example_df, dataset_identifier) + + # Check if the view is registered + check_result, view_name = _check_view_exists( + tmpdir, test_manager, dataset_identifier + ) + assert check_result + + # Check if the new data is inserted + result = test_manager.duckdb_conn.execute(f"SELECT * FROM {view_name}").fetchall() + assert len(result) == 3 + assert result[0][0] == "2022-01-01" + assert result[0][1] == 10 + assert result[1][0] == "2022-02-01" + assert result[1][1] == 20 + assert result[2][0] == "2022-03-01" + assert result[2][1] == 30 + _clean_up_test_manager(tmpdir, dataset_identifier) + + +def test_query_data(tmpdir): + test_manager, example_df, dataset_identifier = _get_test_manager(tmpdir) + test_manager.insert_to_table(example_df, dataset_identifier) + + # Check if the view is registered + check_result, _ = _check_view_exists(tmpdir, test_manager, dataset_identifier) + assert check_result + + # Execute the provided SQL query + result_df = test_manager.query_data( + dataset_identifier, "SELECT * FROM {view_name} WHERE value > 15" + ) + assert len(result_df) == 2, "Query did not return the expected number of rows." 
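+    # drop the created table so later tests start from a clean DuckDB state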
+ _clean_up_test_manager(tmpdir, dataset_identifier) + + +def test_drop_table(tmpdir): + test_manager, example_df, dataset_identifier = _get_test_manager(tmpdir) + + test_manager.insert_to_table(example_df, dataset_identifier) + + # Check if the view is registered + check_result, view_name = _check_view_exists( + tmpdir, test_manager, dataset_identifier + ) + assert check_result + + # Drop the table + test_manager.drop_table(dataset_identifier, ds_type="table") + + # Check if the view is dropped + tables = test_manager.duckdb_conn.execute( + "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" + ).fetchall() + assert view_name not in [table[0] for table in tables] + _clean_up_test_manager(tmpdir, dataset_identifier) + + +def test_fill_from_csv_destination(tmpdir): + test_manager, example_df, dataset_identifier = _get_test_manager(tmpdir) + csv_folder_path = os.path.join(str(tmpdir), "csv_folder") + os.makedirs(csv_folder_path, exist_ok=True) + example_df.write_csv(os.path.join(str(csv_folder_path), "data.csv")) + + test_manager.fill_from_csv_destination(csv_folder_path, dataset_identifier) + + # Check if the view is registered + check_result, view_name = _check_view_exists( + tmpdir, test_manager, dataset_identifier + ) + assert check_result + + # Check if the new data is inserted + result = test_manager.duckdb_conn.execute(f"SELECT * FROM {view_name}").fetchall() + assert len(result) == 3 + assert result[0][0] == "2022-01-01" + assert result[0][1] == 10 + assert result[1][0] == "2022-02-01" + assert result[1][1] == 20 + assert result[2][0] == "2022-03-01" + assert result[2][1] == 30 + + _clean_up_test_manager(tmpdir, dataset_identifier) + # clean csv folder + # delete files in the folder + for file in os.listdir(csv_folder_path): + file_path = os.path.join(csv_folder_path, file) + os.remove(file_path) + + # delete the folder + os.rmdir(csv_folder_path) From ac81100506a7347da602127aece848b121cb41b3 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 21 Feb 2024 12:34:32 +0300 Subject: [PATCH 03/27] duckdb dependency is added to setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 877bdb137..2237921b2 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ "bumpversion", "ccxt>=4.1.59", "coverage", + "duckdb", "enforce_typing", "eth-account", "eth-keys", From 6ffc626aa34cefde86a4e4ac136817d516f29073 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Tue, 27 Feb 2024 14:15:02 +0300 Subject: [PATCH 04/27] dry fix --- pdr_backend/lake/base_data_store.py | 3 ++- pdr_backend/lake/persistent_data_store.py | 10 +++++----- pdr_backend/lake/test/test_persistent_data_store.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pdr_backend/lake/base_data_store.py b/pdr_backend/lake/base_data_store.py index 368841df7..70c88e518 100644 --- a/pdr_backend/lake/base_data_store.py +++ b/pdr_backend/lake/base_data_store.py @@ -30,7 +30,8 @@ def _generate_view_name(self, base_path=str) -> str: str - A unique view name. 
""" - hash_object = md5(base_path.encode()) + path = f"{self.base_directory}/{base_path}" + hash_object = md5(path.encode()) return f"dataset_{hash_object.hexdigest()}" @abstractmethod diff --git a/pdr_backend/lake/persistent_data_store.py b/pdr_backend/lake/persistent_data_store.py index 6dc02f45e..b87c807f6 100644 --- a/pdr_backend/lake/persistent_data_store.py +++ b/pdr_backend/lake/persistent_data_store.py @@ -32,7 +32,7 @@ def _create_and_fill_table( dataset_identifier - A unique identifier for the dataset. """ - view_name = self._generate_view_name(self.base_directory + dataset_identifier) + view_name = self._generate_view_name(dataset_identifier) # self.duckdb_conn.register(view_name, df) # Create the table @@ -54,7 +54,7 @@ def insert_to_table(self, df: pl.DataFrame, dataset_identifier: str): insert_to_table(df, "people") """ - view_name = self._generate_view_name(self.base_directory + dataset_identifier) + view_name = self._generate_view_name(dataset_identifier) # Check if the table exists tables = self.duckdb_conn.execute( "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" @@ -80,7 +80,7 @@ def query_data( query_data("people", "SELECT * FROM {view_name}") """ - view_name = self._generate_view_name(self.base_directory + dataset_identifier) + view_name = self._generate_view_name(dataset_identifier) result_df = self.duckdb_conn.execute(query.format(view_name=view_name)).df() return pl.DataFrame(result_df) @@ -99,7 +99,7 @@ def drop_table(self, dataset_identifier: str, ds_type: str = "table"): if ds_type not in ["view", "table"]: raise ValueError("ds_type must be either 'view' or 'table'") - view_name = self._generate_view_name(self.base_directory + dataset_identifier) + view_name = self._generate_view_name(dataset_identifier) self.duckdb_conn.execute(f"DROP {ds_type} {view_name}") @enforce_types @@ -139,7 +139,7 @@ def update_data( update_data(df, "people", "id") """ - view_name = self._generate_view_name(self.base_directory + dataset_identifier) + view_name = self._generate_view_name(dataset_identifier) update_columns = ", ".join( [f"{column} = {df[column]}" for column in df.columns] ) diff --git a/pdr_backend/lake/test/test_persistent_data_store.py b/pdr_backend/lake/test/test_persistent_data_store.py index e97fc366b..33549b986 100644 --- a/pdr_backend/lake/test/test_persistent_data_store.py +++ b/pdr_backend/lake/test/test_persistent_data_store.py @@ -34,7 +34,7 @@ def _clean_up_test_manager(tmpdir, dataset_identifier): def _check_view_exists(tmpdir, test_manager, dataset_identifier): - view_name = test_manager._generate_view_name(str(tmpdir) + dataset_identifier) + view_name = test_manager._generate_view_name(dataset_identifier) tables = test_manager.duckdb_conn.execute( "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" ).fetchall() From 1e3892b2f6f0806ed9bd3cd062413d08131181e6 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Tue, 27 Feb 2024 21:21:53 +0300 Subject: [PATCH 05/27] CSV data handler - part 1 --- pdr_backend/lake/csv_data_store.py | 268 +++++++++++++++++++ pdr_backend/lake/test/test_csv_data_store.py | 153 +++++++++++ 2 files changed, 421 insertions(+) create mode 100644 pdr_backend/lake/csv_data_store.py create mode 100644 pdr_backend/lake/test/test_csv_data_store.py diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py new file mode 100644 index 000000000..aefbd696c --- /dev/null +++ b/pdr_backend/lake/csv_data_store.py @@ -0,0 +1,268 @@ +import os +from typing import List, 
Optional +import polars as pl + +class CSVDataStore: + def __init__(self, base_path: str): + self.base_path = base_path + + def _get_folder_path(self, dataset_identifier: str) -> str: + """ + Returns the folder path for the given dataset_identifier. + If the folder does not exist, it will be created. + @args: + dataset_identifier: str - identifier of the dataset + """ + folder_path = os.path.join(self.base_path, dataset_identifier) + os.makedirs(folder_path, exist_ok=True) + return folder_path + + def _create_file_name( + self, + dataset_identifier: str, + start_time: int, + end_time: int, + row_count: int + ) -> str: + """ + Creates a file name using the given dataset_identifier, + start_time, end_time, and row_count. + @args: + dataset_identifier: str - identifier of the dataset + start_time: int - start time of the data TIMESTAMP + end_time: int - end time of the data TIMESTAMP + row_count: int - number of rows in the data + """ + fill_with_zero_start = 10 - len(str(start_time)) + fill_with_zero_end = 10 - len(str(end_time)) + start_time = f"{fill_with_zero_start * '0'}{start_time}" + end_time = f"{fill_with_zero_end * '0'}{end_time}" + + return f"{dataset_identifier}_from_{start_time}_to_{end_time}_{row_count}.csv" + + def _create_file_path( + self, + dataset_identifier: str, + start_time: int, + end_time: int, + row_count: int + ) -> str: + """ + Creates the file path for the given dataset_identifier, + start_time, end_time, and row_count. + @args: + dataset_identifier: str - identifier of the dataset + start_time: str - start time of the data + end_time: str - end time of the data + row_count: int - number of rows in the data + """ + + file_name = self._create_file_name(dataset_identifier, start_time, end_time, row_count) + folder_path = self._get_folder_path(dataset_identifier) + return os.path.join(folder_path, file_name) + + def _chunk_data(self, data: pl.DataFrame) -> List[pl.DataFrame]: + """ + Splits the given data into chunks of up to 1000 rows each. + @args: + data: pl.DataFrame - data to be chunked + """ + return [data.slice(i, min(1000, len(data) - i)) for i in range(0, len(data), 1000)] + + def write(self, dataset_identifier: str, data: pl.DataFrame): + """ + Writes the given data to a csv file in the folder + corresponding to the given dataset_identifier. 
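+        New rows first top up the latest partial file (one holding fewer than 1000 rows); anything left is written out in new files of up to 1000 rows.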
+ @args: + data: pl.DataFrame - The data to write, it has to be sorted by timestamp + dataset_identifier: str - The dataset identifier + """ + + last_file_row_count = self._get_last_file_row_count(dataset_identifier) + if last_file_row_count is not None: + if last_file_row_count < 1000: + remaining_rows = 1000 - last_file_row_count + + # get the first remaining_rows rows + if len(data) < remaining_rows: + remaining_rows = len(data) + + remaining_data = data.slice(0, remaining_rows) + + last_file_path = self._get_last_file_path( + self._get_folder_path(dataset_identifier)) + last_file_data = pl.read_csv(last_file_path) + last_file_data = last_file_data.vstack(remaining_data) + + t_start_time = last_file_data['timestamp'][0] + t_end_time = last_file_data['timestamp'][-1] + + last_file_data.write_csv(last_file_path) + # change the name of the file to reflect the new row count + new_file_path = self._create_file_path( + dataset_identifier, t_start_time, t_end_time, len(last_file_data)) + + print("new_file_path", new_file_path) + os.rename(last_file_path, new_file_path) + + data = data.slice(remaining_rows, len(data) - remaining_rows) + + chunks = [ + data.slice(i, min(1000, len(data) - i)) + for i in range(0, len(data), 1000)] + + for i, chunk in enumerate(chunks): + start_time = int(chunk['timestamp'][0]) + end_time = int(chunk['timestamp'][-1]) + file_path = self._create_file_path(dataset_identifier, start_time, end_time, len(chunk)) + chunk.write_csv(file_path) + + def bulk_write(self, data_list: List[pl.DataFrame], dataset_identifier: str): + """ + Writes the given list of data to csv files in the folder + corresponding to the given dataset_identifier. + @args: + data_list: List[pl.DataFrame] - list of data to be written + dataset_identifier: str - identifier of the dataset + """ + for data in data_list: + self.write(data, dataset_identifier) + + def _get_file_paths( + self, + folder_path: str, + start_time: str, + end_time: str + ) -> List[str]: + """ + Returns a list of file paths in the given folder_path + that contain the given start_time and end_time. + @args: + folder_path: str - path of the folder + start_time: str - start time of the data + end_time: str - end time of the data + @returns: + List[str] - list of file paths + """ + + file_names = os.listdir(folder_path) + file_paths = [os.path.join(folder_path, file_name) for file_name in file_names] + + #find files which has a higher start time and lower end time + print("file_paths_aaaa", file_paths) + file_paths = [ + file_path for file_path in file_paths + #firstly, take the filename from the path (/path/to/file.csv -> file.csv) + #then, split the filename by "_" and take the 4th and 5th elements + #then, convert them to int and check if they are in the range + if int(file_path.split("/")[-1].split("_")[2]) >= int(start_time) + and int(file_path.split("/")[-1].split("_")[4]) <= int(end_time)] + + return file_paths + + def read( + self, + dataset_identifier: str, + start_time: str, + end_time: str + ) -> pl.DataFrame: + """ + Reads the data from the csv file in the folder + corresponding to the given dataset_identifier, + start_time, and end_time. 
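+        Only the first file whose time range falls inside [start_time, end_time] is read; an empty DataFrame is returned when none match.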
+ @args: + dataset_identifier: str - identifier of the dataset + start_time: str - start time of the data + end_time: str - end time of the data + @returns: + pl.DataFrame - data read from the csv file + """ + folder_path = self._get_folder_path(dataset_identifier) + file_paths = self._get_file_paths(folder_path, start_time, end_time) + return pl.read_csv(file_paths[0]) if file_paths else pl.DataFrame() + + def read_all( + self, + dataset_identifier: str + ) -> pl.DataFrame: + """ + Reads all the data from the csv files in the folder + corresponding to the given dataset_identifier. + @args: + dataset_identifier: str - identifier of the dataset + @returns: + pl.DataFrame - data read from the csv files + """ + + folder_path = self._get_folder_path(dataset_identifier) + file_names = os.listdir(folder_path) + file_paths = [os.path.join(folder_path, file_name) for file_name in file_names] + file_paths.sort() + + # print("read_all_file_paths", file_paths) + if file_paths: + # Read the first file to create the DataFrame + data = pl.read_csv(file_paths[0]) + # Read the remaining files and append them to the DataFrame + for file_path in file_paths[1:]: + data = data.vstack(pl.read_csv(file_path)) + return data + else: + return pl.DataFrame() + + def _get_last_file_path( + self, + folder_path: str + ) -> str: + """ + Returns the path of the last file in the given folder_path. + @args: + folder_path: str - path of the folder + @returns: + str - path of the last file + """ + + file_names = sorted(os.listdir(folder_path)) + return os.path.join(folder_path, file_names[-1]) if file_names else "" + + def get_last_timestamp( + self, + dataset_identifier: str + ) -> Optional[int]: + """ + Returns the last timestamp from the csv files in the folder + corresponding to the given dataset_identifier. + @args: + dataset_identifier: str - identifier of the dataset + @returns: + Optional[int] - last timestamp from the csv files + """ + folder_path = self._get_folder_path(dataset_identifier) + last_file_path = self._get_last_file_path(folder_path) + if len(last_file_path): + return int(last_file_path.split("_")[3]) + else: + return None + + def _get_last_file_row_count(self, dataset_identifier: str) -> Optional[int]: + """ + Returns the row count of the last file for the given dataset_identifier. 
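+        The count is parsed from the trailing "_<row_count>.csv" suffix of the file name.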
+ @args: + dataset_identifier: str - The dataset identifier + @returns: + row_count: Optional[int] - The row count of the last file + """ + + folder_path = self._get_folder_path(dataset_identifier) + file_names = os.listdir(folder_path) + + # Sort by file name + file_names.sort() + if len(file_names) == 0: + return None + + last_file_path = os.path.join(folder_path, file_names[-1]) + + # parse the row count from the file name + row_count = int(last_file_path.split("_")[-1].split(".")[0]) + return row_count diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py new file mode 100644 index 000000000..eda5ef4b4 --- /dev/null +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -0,0 +1,153 @@ +import polars as pl +import os +from pdr_backend.lake.csv_data_store import CSVDataStore + +def _get_test_manager(tmpdir): + return CSVDataStore(str(tmpdir)) + +def _clean_up(tmpdir): + for root, dirs, files in os.walk(tmpdir): + for file in files: + os.remove(os.path.join(root, file)) + for dir in dirs: + # clean up the directory + _clean_up(os.path.join(root, dir)) + os.rmdir(os.path.join(root, dir)) + +def test_get_folder_path(tmpdir): + manager = _get_test_manager(tmpdir) + folder_path = manager._get_folder_path("test") + assert folder_path == f"{tmpdir}/test" + +def test_create_file_name(tmpdir): + manager = _get_test_manager(tmpdir) + file_name = manager._create_file_name("test", 0, 1, 2) + print("file_name", file_name) + assert file_name == "test_from_0000000000_to_0000000001_2.csv" + +def test_get_file_paths(tmpdir): + manager = _get_test_manager(tmpdir) + file_name_1 = manager._create_file_name("test", 0, 20, 20) + file_name_2 = manager._create_file_name("test", 21, 40, 20) + file_name_3 = manager._create_file_name("test", 41, 60, 20) + file_name_4 = manager._create_file_name("test", 61, 80, 20) + + files = [file_name_1, file_name_2, file_name_3, file_name_4] + + folder_path = manager._get_folder_path("test") + + if not os.path.exists(folder_path): + os.makedirs(folder_path) + + for file in files: + # create empty files + with open( + os.path.join(folder_path, file) + , "w") as f: + pass + + # check if empty files are created + for file in files: + assert os.path.exists(folder_path + "/" + file) + + file_paths = manager._get_file_paths(folder_path, 21, 60) + + for file_path in file_paths: + assert file_path in [folder_path + "/" + file_name_2, folder_path + "/" + file_name_3] + + _clean_up(tmpdir) + +def test_create_file_path(tmpdir): + manager = _get_test_manager(tmpdir) + file_path = manager._create_file_path("test", 1, 2, 2) + assert file_path == f"{tmpdir}/test/test_from_0000000001_to_0000000002_2.csv" + +def test_read(tmpdir): + manager = _get_test_manager(tmpdir) + file_path = manager._create_file_path("test", 1, 2, 2) + + with open(file_path, "w") as file: + file.write("a,b,c\n1,2,3\n4,5,6") + + data = manager.read("test", 1, 2) + assert data.equals(pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})) + + _clean_up(tmpdir) + +def test_read_all(tmpdir): + manager = _get_test_manager(tmpdir) + + file_path_1 = manager._create_file_path("test", 0, 20, 2) + file_path_2 = manager._create_file_path("test", 21, 41, 2) + + with open(file_path_1, "w") as file: + file.write("a,b,c\n1,2,3\n4,5,6") + + with open(file_path_2, "w") as file: + file.write("a,b,c\n7,8,9\n10,11,12") + + data = manager.read_all("test") + assert data['a'].to_list() == [1, 4, 7, 10] + assert data['b'].to_list() == [2, 5, 8, 11] + assert data['c'].to_list() == [3, 6, 9, 12] + 
+ _clean_up(tmpdir) + +def test_get_last_file_path(tmpdir): + manager = _get_test_manager(tmpdir) + file_path_1 = manager._create_file_path("test", 0, 20, 2) + file_path_2 = manager._create_file_path("test", 21, 41, 2) + file_path_3 = manager._create_file_path("test", 42, 62, 2) + file_path_4 = manager._create_file_path("test", 63, 83, 2) + + files = [file_path_1, file_path_2, file_path_3, file_path_4] + + folder_path = manager._get_folder_path("test") + + if not os.path.exists(folder_path): + os.makedirs(folder_path) + + for file in files: + # create empty files + with open( + os.path.join(folder_path, file) + , "w") as f: + pass + + + assert manager._get_last_file_path(f"{tmpdir}/test") == os.path.join(folder_path, file_path_4) + + _clean_up(tmpdir) + +def test_write(tmpdir): + manager = _get_test_manager(tmpdir) + data = pl.DataFrame({"a": [1, 4], "b": [2, 5], "timestamp": [3, 6]}) + manager.write("test", data) + file_name = manager._create_file_path("test", 3, 6, 2) + + data = pl.read_csv(file_name) + + assert data['a'].to_list() == [1, 4] + assert data['b'].to_list() == [2, 5] + assert data['timestamp'].to_list() == [3, 6] + + _clean_up(tmpdir) + +def test_write_append(tmpdir): + manager = _get_test_manager(tmpdir) + data = pl.DataFrame({"a": [1, 4], "b": [2, 5], "timestamp": [3, 6]}) + manager.write("test", data) + + # new data + data = pl.DataFrame({"a": [11, 41], "b": [21, 51], "timestamp": [31, 61]}) + manager.write("test", data) + + file_name = manager._create_file_path("test", 3, 61, 4) + + data = pl.read_csv(file_name) + + assert data['a'].to_list() == [1, 4, 11, 41] + assert data['b'].to_list() == [2, 5, 21, 51] + assert data['timestamp'].to_list() == [3, 6, 31, 61] + + _clean_up(tmpdir) From 3670826d1948a0e6768f06e94198b9008b15fd97 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Tue, 27 Feb 2024 21:31:31 +0300 Subject: [PATCH 06/27] csv data store - fill with zero --- pdr_backend/lake/csv_data_store.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index aefbd696c..9785da6ed 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -16,6 +16,15 @@ def _get_folder_path(self, dataset_identifier: str) -> str: folder_path = os.path.join(self.base_path, dataset_identifier) os.makedirs(folder_path, exist_ok=True) return folder_path + + def _fill_with_zero(self, number: int, length: int = 10) -> str: + """ + Fills the given number with zeros to make it 10 digits long. 
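+        e.g. _fill_with_zero(42) -> "0000000042"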
+ @args: + number: int - number to fill with zeros + """ + number_str = str(number) + return f"{(length - len(number_str)) * '0'}{number_str}" def _create_file_name( self, @@ -33,12 +42,10 @@ def _create_file_name( end_time: int - end time of the data TIMESTAMP row_count: int - number of rows in the data """ - fill_with_zero_start = 10 - len(str(start_time)) - fill_with_zero_end = 10 - len(str(end_time)) - start_time = f"{fill_with_zero_start * '0'}{start_time}" - end_time = f"{fill_with_zero_end * '0'}{end_time}" + start_time_str = self._fill_with_zero(start_time) + end_time_str = self._fill_with_zero(end_time) - return f"{dataset_identifier}_from_{start_time}_to_{end_time}_{row_count}.csv" + return f"{dataset_identifier}_from_{start_time_str}_to_{end_time_str}_{row_count}.csv" def _create_file_path( self, From b324491980ac748df668617a371efe4b20db7a57 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 28 Feb 2024 15:48:57 +0300 Subject: [PATCH 07/27] table csv integration --- pdr_backend/lake/csv_data_store.py | 40 +++++++++++++++----- pdr_backend/lake/table.py | 19 ++-------- pdr_backend/lake/test/test_csv_data_store.py | 9 ++--- pdr_backend/lake/test/test_table.py | 8 ++-- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index 9785da6ed..2db86e696 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -2,6 +2,8 @@ from typing import List, Optional import polars as pl +from polars.type_aliases import SchemaDict + class CSVDataStore: def __init__(self, base_path: str): self.base_path = base_path @@ -76,7 +78,10 @@ def _chunk_data(self, data: pl.DataFrame) -> List[pl.DataFrame]: """ return [data.slice(i, min(1000, len(data) - i)) for i in range(0, len(data), 1000)] - def write(self, dataset_identifier: str, data: pl.DataFrame): + def write(self, + dataset_identifier: str, + data: pl.DataFrame, + schema: Optional[SchemaDict] = None): """ Writes the given data to a csv file in the folder corresponding to the given dataset_identifier. 
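+        An optional schema can be supplied so the last partial file is re-read with the expected dtypes.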
@@ -98,7 +103,7 @@ def write(self, dataset_identifier: str, data: pl.DataFrame): last_file_path = self._get_last_file_path( self._get_folder_path(dataset_identifier)) - last_file_data = pl.read_csv(last_file_path) + last_file_data = pl.read_csv(last_file_path, schema=schema) last_file_data = last_file_data.vstack(remaining_data) t_start_time = last_file_data['timestamp'][0] @@ -133,7 +138,7 @@ def bulk_write(self, data_list: List[pl.DataFrame], dataset_identifier: str): dataset_identifier: str - identifier of the dataset """ for data in data_list: - self.write(data, dataset_identifier) + self.write(dataset_identifier, data) def _get_file_paths( self, @@ -171,7 +176,8 @@ def read( self, dataset_identifier: str, start_time: str, - end_time: str + end_time: str, + schema: Optional[SchemaDict] = None ) -> pl.DataFrame: """ Reads the data from the csv file in the folder @@ -184,13 +190,27 @@ def read( @returns: pl.DataFrame - data read from the csv file """ - folder_path = self._get_folder_path(dataset_identifier) - file_paths = self._get_file_paths(folder_path, start_time, end_time) - return pl.read_csv(file_paths[0]) if file_paths else pl.DataFrame() + data = self.read_all(dataset_identifier, schema=schema) + # if the data is empty, return + if len(data) == 0: + return data + + # if the data is not empty, + # check the timestamp column exists and is of type int64 + if "timestamp" not in data.columns: + return data + + return data.filter( + data['timestamp'] >= int(start_time) + ).filter( + data['timestamp'] <= int(end_time) + ) + #return pl.read_csv(file_paths[0]) if file_paths else pl.DataFrame() def read_all( self, - dataset_identifier: str + dataset_identifier: str, + schema: Optional[SchemaDict] = None ) -> pl.DataFrame: """ Reads all the data from the csv files in the folder @@ -209,10 +229,10 @@ def read_all( # print("read_all_file_paths", file_paths) if file_paths: # Read the first file to create the DataFrame - data = pl.read_csv(file_paths[0]) + data = pl.read_csv(file_paths[0], schema=schema) # Read the remaining files and append them to the DataFrame for file_path in file_paths[1:]: - data = data.vstack(pl.read_csv(file_path)) + data = data.vstack(pl.read_csv(file_path, schema=schema)) return data else: return pl.DataFrame() diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 8c6b12e03..d715fb747 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -9,6 +9,7 @@ from pdr_backend.util.time_types import UnixTimeMs from pdr_backend.lake.plutil import _object_list_to_df from pdr_backend.lake.table_pdr_predictions import _transform_timestamp_to_ms +from pdr_backend.lake.csv_data_store import CSVDataStore logger = logging.getLogger("table") @@ -27,25 +28,11 @@ def load(self): """ Read the data from the Parquet file into a DataFrame object """ - filename = self._parquet_filename() + self.csv_data_store = CSVDataStore(self.ppss.lake_ss.parquet_dir) st_ut = self.ppss.lake_ss.st_timestamp fin_ut = self.ppss.lake_ss.fin_timestamp - # load all data from file - # check if file exists - # if file doesn't exist, return an empty dataframe with the expected schema - if os.path.exists(filename): - logger.info("Loading parquet for %s", self.table_name) - df = pl.read_parquet(filename) - df = df.filter( - (pl.col("timestamp") >= st_ut) & (pl.col("timestamp") <= fin_ut) - ) - else: - logger.info("Create initial df for %s", self.table_name) - df = pl.DataFrame(schema=self.df_schema) - - # save data frame in memory - self.df = df + self.df = 
self.csv_data_store.read(self.table_name, st_ut, fin_ut, schema=self.df_schema) @enforce_types def save(self): diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py index eda5ef4b4..f0fde60fa 100644 --- a/pdr_backend/lake/test/test_csv_data_store.py +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -21,9 +21,9 @@ def test_get_folder_path(tmpdir): def test_create_file_name(tmpdir): manager = _get_test_manager(tmpdir) - file_name = manager._create_file_name("test", 0, 1, 2) + file_name = manager._create_file_name("test", 1707030362, 1709060200, 1000) print("file_name", file_name) - assert file_name == "test_from_0000000000_to_0000000001_2.csv" + assert file_name == "test_from_1707030362_to_1709060200_1000.csv" def test_get_file_paths(tmpdir): manager = _get_test_manager(tmpdir) @@ -35,7 +35,7 @@ def test_get_file_paths(tmpdir): files = [file_name_1, file_name_2, file_name_3, file_name_4] folder_path = manager._get_folder_path("test") - + if not os.path.exists(folder_path): os.makedirs(folder_path) @@ -99,7 +99,7 @@ def test_get_last_file_path(tmpdir): file_path_2 = manager._create_file_path("test", 21, 41, 2) file_path_3 = manager._create_file_path("test", 42, 62, 2) file_path_4 = manager._create_file_path("test", 63, 83, 2) - + files = [file_path_1, file_path_2, file_path_3, file_path_4] folder_path = manager._get_folder_path("test") @@ -114,7 +114,6 @@ def test_get_last_file_path(tmpdir): , "w") as f: pass - assert manager._get_last_file_path(f"{tmpdir}/test") == os.path.join(folder_path, file_path_4) _clean_up(tmpdir) diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index ceb300574..1686f3f78 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -215,10 +215,10 @@ def test_get_pdr_df_multiple_fetches(): captured_output = StringIO() sys.stdout = captured_output - save_backoff_limit = 40 - pagination_limit = 20 + save_backoff_limit = 4 + pagination_limit = 2 st_timest = UnixTimeMs(1704110400000) - fin_timest = UnixTimeMs(1704111600000) + fin_timest = UnixTimeMs(1704115800000) table.get_pdr_df( fetch_function=fetch_filtered_predictions, network="sapphire-mainnet", @@ -238,4 +238,4 @@ def test_get_pdr_df_multiple_fetches(): count_saves = printed_text.count("Saved") assert count_saves == 2 - assert len(table.df) == 50 + assert len(table.df) == 5 From c2c4e81ad13b3a47433a970ec6d9fca5bd403fc1 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 28 Feb 2024 16:04:25 +0300 Subject: [PATCH 08/27] test fix - 1 --- pdr_backend/lake/table.py | 27 ++++------ pdr_backend/lake/test/test_table.py | 84 ++++++----------------------- 2 files changed, 27 insertions(+), 84 deletions(-) diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index d715fb747..22f7fa7ec 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -50,22 +50,15 @@ def save(self): <= self.df.tail(1)["timestamp"].to_list()[0] ) - filename = self._parquet_filename() - - if os.path.exists(filename): # "append" existing file - cur_df = pl.read_parquet(filename) - self.df = pl.concat([cur_df, self.df]) - - # drop duplicates - self.df = self.df.filter(pl.struct("ID").is_unique()) - self.df.write_parquet(filename) - n_new = self.df.shape[0] - cur_df.shape[0] - print(f" Just appended {n_new} df rows to file {filename}") - else: # write new file - self.df.write_parquet(filename) - print( - f" Just saved df with {self.df.shape[0]} rows to new file {filename}" - ) + + + cur_df = 
self.csv_data_store.read_all(self.table_name, schema=self.df_schema) + + self.df = pl.concat([cur_df, self.df]) + self.df = self.df.filter(pl.struct("ID").is_unique()) + self.csv_data_store.write(self.table_name, self.df, schema=self.df_schema) + n_new = self.df.shape[0] - cur_df.shape[0] + print(f" Just saved df with {n_new} df rows to the csv files of {self.table_name}") @enforce_types def get_pdr_df( @@ -84,6 +77,7 @@ def get_pdr_df( Update function for graphql query, returns raw data + Transforms ts into ms as required for data factory """ + print(f"Fetching data for {self.table_name}") network = get_sapphire_postfix(network) # save to file when this amount of data is fetched @@ -118,6 +112,7 @@ def get_pdr_df( save_backoff_count += len(df) + print(f" Fetched {len(df)} records from subgraph") # save to file if requred number of data has been fetched if ( save_backoff_count >= save_backoff_limit or len(df) < pagination_limit diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index 1686f3f78..e670e61e6 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -34,6 +34,19 @@ def __init__(self, data): "user": "0x123", } +def _clean_up(tmp_path, table_name): + """ + Delete test file if already exists + """ + folder_path = os.path.join(tmp_path, table_name) + + if os.path.exists(folder_path): + #delete files + for file in os.listdir(folder_path): + file_path = os.path.join(folder_path, file) + os.remove(file_path) + os.remove(folder_path) + def mock_fetch_function( network, st_ut, fin_ut, save_backoff_limit, pagination_limit, config @@ -130,6 +143,7 @@ def test_save_table(): captured_output = StringIO() sys.stdout = captured_output + print("table.df--1", table.df) assert len(table.df) == 0 table.df = pl.DataFrame([mocked_object], table_df_schema) table.save() @@ -139,30 +153,6 @@ def test_save_table(): assert "Just saved df with" in printed_text - -def test_all(): - """ - Test multiple table actions in one go - """ - st_timestr = "2023-12-03" - fin_timestr = "2023-12-05" - ppss = mock_ppss( - ["binance BTC/USDT c 5m"], - "sapphire-mainnet", - ".", - st_timestr=st_timestr, - fin_timestr=fin_timestr, - ) - - table = Table(table_name, table_df_schema, ppss) - table.df = pl.DataFrame([], table_df_schema) - assert len(table.df) == 0 - table.df = pl.DataFrame([mocked_object], table_df_schema) - table.load() - - assert len(table.df) == 1 - - def test_get_pdr_df(): """ Test multiple table actions in one go @@ -193,49 +183,7 @@ def test_get_pdr_df(): pagination_limit, {"contract_list": ["0x123"]}, ) - assert len(table.df) == 1 - - -def test_get_pdr_df_multiple_fetches(): - """ - Test multiple table actions in one go - """ + print("table.df---", table.df) - st_timestr = "2023-12-03_00:00" - fin_timestr = "2023-12-03_16:00" - ppss = mock_ppss( - ["binance BTC/USDT c 5m"], - "sapphire-mainnet", - ".", - st_timestr=st_timestr, - fin_timestr=fin_timestr, - ) - - table = Table("test_prediction_table_multiple", predictions_schema, ppss) - captured_output = StringIO() - sys.stdout = captured_output - - save_backoff_limit = 4 - pagination_limit = 2 - st_timest = UnixTimeMs(1704110400000) - fin_timest = UnixTimeMs(1704115800000) - table.get_pdr_df( - fetch_function=fetch_filtered_predictions, - network="sapphire-mainnet", - st_ut=st_timest, - fin_ut=fin_timest, - save_backoff_limit=save_backoff_limit, - pagination_limit=pagination_limit, - config={"contract_list": ["0x18f54cc21b7a2fdd011bea06bba7801b280e3151"]}, - ) - printed_text = 
captured_output.getvalue().strip() - - # test fetches multiple times - count_fetches = printed_text.count("Fetched") - assert count_fetches == 3 - - # test saves multiple times - count_saves = printed_text.count("Saved") - assert count_saves == 2 + assert len(table.df) == 0 - assert len(table.df) == 5 From 1b118ae38d6c414cb49f953a06e89e86cefc806c Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 28 Feb 2024 18:30:15 +0300 Subject: [PATCH 09/27] fixing tests - 2 --- pdr_backend/lake/table.py | 18 +++--- pdr_backend/lake/test/test_table.py | 86 ++++++++++++++++++++++------- 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 22f7fa7ec..60d2bcb13 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -20,7 +20,7 @@ def __init__(self, table_name: str, df_schema: object, ppss: PPSS): self.ppss = ppss self.table_name = table_name self.df_schema = df_schema - self.df = pl.DataFrame() + self.df = pl.DataFrame(schema=df_schema) self.load() @enforce_types @@ -50,8 +50,6 @@ def save(self): <= self.df.tail(1)["timestamp"].to_list()[0] ) - - cur_df = self.csv_data_store.read_all(self.table_name, schema=self.df_schema) self.df = pl.concat([cur_df, self.df]) @@ -84,7 +82,7 @@ def get_pdr_df( save_backoff_count = 0 pagination_offset = 0 - final_df = pl.DataFrame() + final_df = pl.DataFrame([], schema=self.df_schema) while True: # call the function @@ -108,23 +106,27 @@ def get_pdr_df( if len(final_df) == 0: final_df = df else: - final_df = pl.concat([final_df, df]) + final_df = final_df.vstack(df) + print('len(final_df)',len(final_df)) save_backoff_count += len(df) - print(f" Fetched {len(df)} records from subgraph") # save to file if requred number of data has been fetched if ( save_backoff_count >= save_backoff_limit or len(df) < pagination_limit ) and len(final_df) > 0: assert df.schema == self.df_schema # save to parquet - self.df = final_df + #self.df = final_df + self.df = self.df.vstack(final_df) + print("len(self.df)",len(self.df)) self.save() print(f"Saved {len(final_df)} records to file while fetching") - final_df = pl.DataFrame() + final_df = pl.DataFrame([], schema=self.df_schema) save_backoff_count = 0 + print("len(final_df)",len(final_df)) + # avoids doing next fetch if we've reached the end if len(df) < pagination_limit: break diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index e670e61e6..e5ee0abd5 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -29,8 +29,8 @@ def __init__(self, data): "timeframe": "5m", "prediction": True, "payout": 28.2, - "timestamp": 1701634400000, - "slot": 1701634400000, + "timestamp": 1701634400, + "slot": 1701634400, "user": "0x123", } @@ -123,57 +123,59 @@ def test_load_table(): assert len(table.df) == 0 - -def test_save_table(): - """ - Test that table is saving to local file - """ +""" +def test_save_table(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" ppss = mock_ppss( ["binance BTC/USDT c 5m"], "sapphire-mainnet", - ".", + str(tmpdir), st_timestr=st_timestr, fin_timestr=fin_timestr, ) + _clean_up(ppss.lake_ss.parquet_dir, table_name) + table = Table(table_name, table_df_schema, ppss) captured_output = StringIO() sys.stdout = captured_output - print("table.df--1", table.df) assert len(table.df) == 0 table.df = pl.DataFrame([mocked_object], table_df_schema) table.save() - assert os.path.exists(file_path) + first_ts = table.df.head(1)["timestamp"].to_list()[0] + 
last_ts = table.df.tail(1)["timestamp"].to_list()[0] + + test_file_path = os.path.join(str(ppss.lake_ss.parquet_dir), table_name, f"{table_name}_from_{first_ts}_to_{last_ts}_{len(table.df)}.csv") + assert os.path.exists(test_file_path) printed_text = captured_output.getvalue().strip() assert "Just saved df with" in printed_text +""" -def test_get_pdr_df(): - """ - Test multiple table actions in one go - """ - +""" +def test_get_pdr_df(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" ppss = mock_ppss( ["binance BTC/USDT c 5m"], "sapphire-mainnet", - ".", + str(tmpdir), st_timestr=st_timestr, fin_timestr=fin_timestr, ) + _clean_up(ppss.lake_ss.parquet_dir, table_name) + table = Table(table_name, table_df_schema, ppss) save_backoff_limit = 5000 pagination_limit = 1000 - st_timest = UnixTimeMs(1701634400000) - fin_timest = UnixTimeMs(1701634400000) + st_timest = UnixTimeMs(1701634300000) + fin_timest = UnixTimeMs(1701634500000) table.get_pdr_df( mock_fetch_function, "sapphire-mainnet", @@ -183,7 +185,51 @@ def test_get_pdr_df(): pagination_limit, {"contract_list": ["0x123"]}, ) - print("table.df---", table.df) - assert len(table.df) == 0 + assert table.df.shape[0] == 1 +""" + +def test_get_pdr_df_multiple_fetches(): + """ + Test multiple table actions in one go + """ + + st_timestr = "2023-12-03_00:00" + fin_timestr = "2023-12-03_16:00" + ppss = mock_ppss( + ["binance BTC/USDT c 5m"], + "sapphire-mainnet", + ".", + st_timestr=st_timestr, + fin_timestr=fin_timestr, + ) + + table = Table("test_prediction_table_multiple", predictions_schema, ppss) + # captured_output = StringIO() + # sys.stdout = captured_output + + save_backoff_limit = 50 + pagination_limit = 20 + st_timest = UnixTimeMs(1704110400000) + fin_timest = UnixTimeMs(1704111600000) + table.get_pdr_df( + fetch_function=fetch_filtered_predictions, + network="sapphire-mainnet", + st_ut=st_timest, + fin_ut=fin_timest, + save_backoff_limit=save_backoff_limit, + pagination_limit=pagination_limit, + config={"contract_list": ["0x18f54cc21b7a2fdd011bea06bba7801b280e3151"]}, + ) + # printed_text = captured_output.getvalue().strip() + + # test fetches multiple times + # count_fetches = printed_text.count("Fetched") + # assert count_fetches == 3 + + # test saves multiple times + # count_saves = printed_text.count("Saved") + # assert count_saves == 1 + # test that the final df is saved + assert len(table.df) == 60 From 4e4687bccc5f1140b69196a9628adaa1b49824a9 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Wed, 28 Feb 2024 19:28:19 +0300 Subject: [PATCH 10/27] test fixes - 3 --- pdr_backend/lake/table.py | 18 ++++---- pdr_backend/lake/test/test_table.py | 65 +++++++++++++++++++++-------- 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 60d2bcb13..8f2f1ff55 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -28,10 +28,10 @@ def load(self): """ Read the data from the Parquet file into a DataFrame object """ + print(f"Loading data for {self.table_name}") self.csv_data_store = CSVDataStore(self.ppss.lake_ss.parquet_dir) st_ut = self.ppss.lake_ss.st_timestamp fin_ut = self.ppss.lake_ss.fin_timestamp - self.df = self.csv_data_store.read(self.table_name, st_ut, fin_ut, schema=self.df_schema) @enforce_types @@ -82,7 +82,7 @@ def get_pdr_df( save_backoff_count = 0 pagination_offset = 0 - final_df = pl.DataFrame([], schema=self.df_schema) + final_df = pl.DataFrame() while True: # call the function @@ -108,7 +108,6 @@ def get_pdr_df( else: 
final_df = final_df.vstack(df) - print('len(final_df)',len(final_df)) save_backoff_count += len(df) # save to file if requred number of data has been fetched @@ -117,21 +116,22 @@ def get_pdr_df( ) and len(final_df) > 0: assert df.schema == self.df_schema # save to parquet - #self.df = final_df - self.df = self.df.vstack(final_df) - print("len(self.df)",len(self.df)) + self.df = final_df.clone() self.save() print(f"Saved {len(final_df)} records to file while fetching") - final_df = pl.DataFrame([], schema=self.df_schema) + final_df = pl.DataFrame() save_backoff_count = 0 - print("len(final_df)",len(final_df)) - # avoids doing next fetch if we've reached the end if len(df) < pagination_limit: break pagination_offset += pagination_limit + if len(final_df) > 0: + self.df = final_df.clone() + self.save() + print(f"Saved {len(final_df)} records to file while fetching") + @enforce_types def _parquet_filename(self) -> str: """ diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index e5ee0abd5..48a3382a3 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -80,7 +80,6 @@ def get_table_df(network, st_ut, fin_ut, config): if os.path.exists(file_path2): os.remove(file_path2) - def test_table_initialization(): """ Test that table is initialized correctly @@ -103,7 +102,6 @@ def test_table_initialization(): assert table.ppss.lake_ss.st_timestr == st_timestr assert table.ppss.lake_ss.fin_timestr == fin_timestr - def test_load_table(): """ Test that table is loading the data from file @@ -123,7 +121,6 @@ def test_load_table(): assert len(table.df) == 0 -""" def test_save_table(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" @@ -154,9 +151,7 @@ def test_save_table(tmpdir): printed_text = captured_output.getvalue().strip() assert "Just saved df with" in printed_text -""" -""" def test_get_pdr_df(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" @@ -187,9 +182,8 @@ def test_get_pdr_df(tmpdir): ) assert table.df.shape[0] == 1 -""" -def test_get_pdr_df_multiple_fetches(): +def test_get_pdr_df_multiple_fetches(tmpdir): """ Test multiple table actions in one go """ @@ -199,16 +193,18 @@ def test_get_pdr_df_multiple_fetches(): ppss = mock_ppss( ["binance BTC/USDT c 5m"], "sapphire-mainnet", - ".", + str(tmpdir), st_timestr=st_timestr, fin_timestr=fin_timestr, ) + _clean_up(ppss.lake_ss.parquet_dir, table_name) + table = Table("test_prediction_table_multiple", predictions_schema, ppss) - # captured_output = StringIO() - # sys.stdout = captured_output + captured_output = StringIO() + sys.stdout = captured_output - save_backoff_limit = 50 + save_backoff_limit = 40 pagination_limit = 20 st_timest = UnixTimeMs(1704110400000) fin_timest = UnixTimeMs(1704111600000) @@ -221,15 +217,50 @@ def test_get_pdr_df_multiple_fetches(): pagination_limit=pagination_limit, config={"contract_list": ["0x18f54cc21b7a2fdd011bea06bba7801b280e3151"]}, ) - # printed_text = captured_output.getvalue().strip() + printed_text = captured_output.getvalue().strip() # test fetches multiple times - # count_fetches = printed_text.count("Fetched") - # assert count_fetches == 3 + count_fetches = printed_text.count("Fetched") + assert count_fetches == 3 # test saves multiple times - # count_saves = printed_text.count("Saved") - # assert count_saves == 1 + count_saves = printed_text.count("Saved") + assert count_saves == 2 # test that the final df is saved - assert len(table.df) == 60 + assert len(table.df) == 50 + +def test_all(tmpdir): + """ + Test 
multiple table actions in one go + """ + st_timestr = "2021-12-03" + fin_timestr = "2023-12-31" + ppss = mock_ppss( + ["binance BTC/USDT c 5m"], + "sapphire-mainnet", + str(tmpdir), + st_timestr=st_timestr, + fin_timestr=fin_timestr, + ) + + _clean_up(ppss.lake_ss.parquet_dir, table_name) + + folder_path = os.path.join(ppss.lake_ss.parquet_dir, table_name) + if not os.path.exists(folder_path): + os.makedirs(folder_path) + + #create the csv file + file_path = os.path.join(folder_path, f"{table_name}_from_1701634400_to_1701634400_1.csv") + + #write the file + with open(file_path, "w") as file: + file.write("ID,pair,timeframe,prediction,payout,timestamp,slot,user\n") + file.write("0x123,ADA-USDT,5m,True,28.2,1701634400000,1701634400000,0x123\n") + + table = Table(table_name, table_df_schema, ppss) + table.df = pl.DataFrame([], table_df_schema) + assert len(table.df) == 0 + + table.load() + assert len(table.df) == 1 From 4582fe35e010d79b2e306d345ac24d2210cd79d4 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Thu, 29 Feb 2024 16:22:00 +0300 Subject: [PATCH 11/27] black fix --- pdr_backend/lake/csv_data_store.py | 134 +++++++++---------- pdr_backend/lake/gql_data_factory.py | 6 +- pdr_backend/lake/table.py | 8 +- pdr_backend/lake/test/test_csv_data_store.py | 48 ++++--- pdr_backend/lake/test/test_table.py | 25 +++- 5 files changed, 121 insertions(+), 100 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index 2db86e696..59503e6be 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -4,6 +4,7 @@ from polars.type_aliases import SchemaDict + class CSVDataStore: def __init__(self, base_path: str): self.base_path = base_path @@ -18,7 +19,7 @@ def _get_folder_path(self, dataset_identifier: str) -> str: folder_path = os.path.join(self.base_path, dataset_identifier) os.makedirs(folder_path, exist_ok=True) return folder_path - + def _fill_with_zero(self, number: int, length: int = 10) -> str: """ Fills the given number with zeros to make it 10 digits long. @@ -29,14 +30,10 @@ def _fill_with_zero(self, number: int, length: int = 10) -> str: return f"{(length - len(number_str)) * '0'}{number_str}" def _create_file_name( - self, - dataset_identifier: str, - start_time: int, - end_time: int, - row_count: int - ) -> str: + self, dataset_identifier: str, start_time: int, end_time: int, row_count: int + ) -> str: """ - Creates a file name using the given dataset_identifier, + Creates a file name using the given dataset_identifier, start_time, end_time, and row_count. @args: dataset_identifier: str - identifier of the dataset @@ -50,12 +47,8 @@ def _create_file_name( return f"{dataset_identifier}_from_{start_time_str}_to_{end_time_str}_{row_count}.csv" def _create_file_path( - self, - dataset_identifier: str, - start_time: int, - end_time: int, - row_count: int - ) -> str: + self, dataset_identifier: str, start_time: int, end_time: int, row_count: int + ) -> str: """ Creates the file path for the given dataset_identifier, start_time, end_time, and row_count. 
@@ -66,7 +59,9 @@ def _create_file_path( row_count: int - number of rows in the data """ - file_name = self._create_file_name(dataset_identifier, start_time, end_time, row_count) + file_name = self._create_file_name( + dataset_identifier, start_time, end_time, row_count + ) folder_path = self._get_folder_path(dataset_identifier) return os.path.join(folder_path, file_name) @@ -76,12 +71,16 @@ def _chunk_data(self, data: pl.DataFrame) -> List[pl.DataFrame]: @args: data: pl.DataFrame - data to be chunked """ - return [data.slice(i, min(1000, len(data) - i)) for i in range(0, len(data), 1000)] - - def write(self, - dataset_identifier: str, - data: pl.DataFrame, - schema: Optional[SchemaDict] = None): + return [ + data.slice(i, min(1000, len(data) - i)) for i in range(0, len(data), 1000) + ] + + def write( + self, + dataset_identifier: str, + data: pl.DataFrame, + schema: Optional[SchemaDict] = None, + ): """ Writes the given data to a csv file in the folder corresponding to the given dataset_identifier. @@ -102,17 +101,19 @@ def write(self, remaining_data = data.slice(0, remaining_rows) last_file_path = self._get_last_file_path( - self._get_folder_path(dataset_identifier)) + self._get_folder_path(dataset_identifier) + ) last_file_data = pl.read_csv(last_file_path, schema=schema) last_file_data = last_file_data.vstack(remaining_data) - t_start_time = last_file_data['timestamp'][0] - t_end_time = last_file_data['timestamp'][-1] + t_start_time = last_file_data["timestamp"][0] + t_end_time = last_file_data["timestamp"][-1] last_file_data.write_csv(last_file_path) # change the name of the file to reflect the new row count new_file_path = self._create_file_path( - dataset_identifier, t_start_time, t_end_time, len(last_file_data)) + dataset_identifier, t_start_time, t_end_time, len(last_file_data) + ) print("new_file_path", new_file_path) os.rename(last_file_path, new_file_path) @@ -120,13 +121,15 @@ def write(self, data = data.slice(remaining_rows, len(data) - remaining_rows) chunks = [ - data.slice(i, min(1000, len(data) - i)) - for i in range(0, len(data), 1000)] + data.slice(i, min(1000, len(data) - i)) for i in range(0, len(data), 1000) + ] for i, chunk in enumerate(chunks): - start_time = int(chunk['timestamp'][0]) - end_time = int(chunk['timestamp'][-1]) - file_path = self._create_file_path(dataset_identifier, start_time, end_time, len(chunk)) + start_time = int(chunk["timestamp"][0]) + end_time = int(chunk["timestamp"][-1]) + file_path = self._create_file_path( + dataset_identifier, start_time, end_time, len(chunk) + ) chunk.write_csv(file_path) def bulk_write(self, data_list: List[pl.DataFrame], dataset_identifier: str): @@ -141,13 +144,10 @@ def bulk_write(self, data_list: List[pl.DataFrame], dataset_identifier: str): self.write(dataset_identifier, data) def _get_file_paths( - self, - folder_path: str, - start_time: str, - end_time: str - ) -> List[str]: + self, folder_path: str, start_time: str, end_time: str + ) -> List[str]: """ - Returns a list of file paths in the given folder_path + Returns a list of file paths in the given folder_path that contain the given start_time and end_time. 
@args: folder_path: str - path of the folder @@ -159,29 +159,31 @@ def _get_file_paths( file_names = os.listdir(folder_path) file_paths = [os.path.join(folder_path, file_name) for file_name in file_names] - - #find files which has a higher start time and lower end time + + # find files which has a higher start time and lower end time print("file_paths_aaaa", file_paths) file_paths = [ - file_path for file_path in file_paths - #firstly, take the filename from the path (/path/to/file.csv -> file.csv) - #then, split the filename by "_" and take the 4th and 5th elements - #then, convert them to int and check if they are in the range + file_path + for file_path in file_paths + # firstly, take the filename from the path (/path/to/file.csv -> file.csv) + # then, split the filename by "_" and take the 4th and 5th elements + # then, convert them to int and check if they are in the range if int(file_path.split("/")[-1].split("_")[2]) >= int(start_time) - and int(file_path.split("/")[-1].split("_")[4]) <= int(end_time)] - + and int(file_path.split("/")[-1].split("_")[4]) <= int(end_time) + ] + return file_paths def read( - self, - dataset_identifier: str, - start_time: str, - end_time: str, - schema: Optional[SchemaDict] = None - ) -> pl.DataFrame: + self, + dataset_identifier: str, + start_time: str, + end_time: str, + schema: Optional[SchemaDict] = None, + ) -> pl.DataFrame: """ Reads the data from the csv file in the folder - corresponding to the given dataset_identifier, + corresponding to the given dataset_identifier, start_time, and end_time. @args: dataset_identifier: str - identifier of the dataset @@ -194,24 +196,20 @@ def read( # if the data is empty, return if len(data) == 0: return data - + # if the data is not empty, # check the timestamp column exists and is of type int64 if "timestamp" not in data.columns: return data - return data.filter( - data['timestamp'] >= int(start_time) - ).filter( - data['timestamp'] <= int(end_time) + return data.filter(data["timestamp"] >= int(start_time)).filter( + data["timestamp"] <= int(end_time) ) - #return pl.read_csv(file_paths[0]) if file_paths else pl.DataFrame() + # return pl.read_csv(file_paths[0]) if file_paths else pl.DataFrame() def read_all( - self, - dataset_identifier: str, - schema: Optional[SchemaDict] = None - ) -> pl.DataFrame: + self, dataset_identifier: str, schema: Optional[SchemaDict] = None + ) -> pl.DataFrame: """ Reads all the data from the csv files in the folder corresponding to the given dataset_identifier. @@ -225,10 +223,10 @@ def read_all( file_names = os.listdir(folder_path) file_paths = [os.path.join(folder_path, file_name) for file_name in file_names] file_paths.sort() - + # print("read_all_file_paths", file_paths) if file_paths: - # Read the first file to create the DataFrame + # Read the first file to create the DataFrame data = pl.read_csv(file_paths[0], schema=schema) # Read the remaining files and append them to the DataFrame for file_path in file_paths[1:]: @@ -237,10 +235,7 @@ def read_all( else: return pl.DataFrame() - def _get_last_file_path( - self, - folder_path: str - ) -> str: + def _get_last_file_path(self, folder_path: str) -> str: """ Returns the path of the last file in the given folder_path. 
@args: @@ -252,10 +247,7 @@ def _get_last_file_path( file_names = sorted(os.listdir(folder_path)) return os.path.join(folder_path, file_names[-1]) if file_names else "" - def get_last_timestamp( - self, - dataset_identifier: str - ) -> Optional[int]: + def get_last_timestamp(self, dataset_identifier: str) -> Optional[int]: """ Returns the last timestamp from the csv files in the folder corresponding to the given dataset_identifier. diff --git a/pdr_backend/lake/gql_data_factory.py b/pdr_backend/lake/gql_data_factory.py index f870ec4eb..e3527a291 100644 --- a/pdr_backend/lake/gql_data_factory.py +++ b/pdr_backend/lake/gql_data_factory.py @@ -139,9 +139,9 @@ def _update(self): print(" Given start time, no data to gather. Exit.") # to satisfy mypy, get an explicit function pointer - do_fetch: Callable[[str, int, int, int, int, Dict, str], pl.DataFrame] = ( - table.get_pdr_df - ) + do_fetch: Callable[ + [str, int, int, int, int, Dict, str], pl.DataFrame + ] = table.get_pdr_df # number of data at which we want to save to file save_backoff_limit = 5000 diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 8f2f1ff55..9c1db0c46 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -32,7 +32,9 @@ def load(self): self.csv_data_store = CSVDataStore(self.ppss.lake_ss.parquet_dir) st_ut = self.ppss.lake_ss.st_timestamp fin_ut = self.ppss.lake_ss.fin_timestamp - self.df = self.csv_data_store.read(self.table_name, st_ut, fin_ut, schema=self.df_schema) + self.df = self.csv_data_store.read( + self.table_name, st_ut, fin_ut, schema=self.df_schema + ) @enforce_types def save(self): @@ -56,7 +58,9 @@ def save(self): self.df = self.df.filter(pl.struct("ID").is_unique()) self.csv_data_store.write(self.table_name, self.df, schema=self.df_schema) n_new = self.df.shape[0] - cur_df.shape[0] - print(f" Just saved df with {n_new} df rows to the csv files of {self.table_name}") + print( + f" Just saved df with {n_new} df rows to the csv files of {self.table_name}" + ) @enforce_types def get_pdr_df( diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py index f0fde60fa..91a5cb34b 100644 --- a/pdr_backend/lake/test/test_csv_data_store.py +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -2,9 +2,11 @@ import os from pdr_backend.lake.csv_data_store import CSVDataStore + def _get_test_manager(tmpdir): return CSVDataStore(str(tmpdir)) + def _clean_up(tmpdir): for root, dirs, files in os.walk(tmpdir): for file in files: @@ -14,17 +16,20 @@ def _clean_up(tmpdir): _clean_up(os.path.join(root, dir)) os.rmdir(os.path.join(root, dir)) + def test_get_folder_path(tmpdir): manager = _get_test_manager(tmpdir) folder_path = manager._get_folder_path("test") assert folder_path == f"{tmpdir}/test" + def test_create_file_name(tmpdir): manager = _get_test_manager(tmpdir) file_name = manager._create_file_name("test", 1707030362, 1709060200, 1000) print("file_name", file_name) assert file_name == "test_from_1707030362_to_1709060200_1000.csv" + def test_get_file_paths(tmpdir): manager = _get_test_manager(tmpdir) file_name_1 = manager._create_file_name("test", 0, 20, 20) @@ -41,9 +46,7 @@ def test_get_file_paths(tmpdir): for file in files: # create empty files - with open( - os.path.join(folder_path, file) - , "w") as f: + with open(os.path.join(folder_path, file), "w") as f: pass # check if empty files are created @@ -53,15 +56,20 @@ def test_get_file_paths(tmpdir): file_paths = manager._get_file_paths(folder_path, 21, 60) for file_path in file_paths: 
- assert file_path in [folder_path + "/" + file_name_2, folder_path + "/" + file_name_3] + assert file_path in [ + folder_path + "/" + file_name_2, + folder_path + "/" + file_name_3, + ] _clean_up(tmpdir) + def test_create_file_path(tmpdir): manager = _get_test_manager(tmpdir) file_path = manager._create_file_path("test", 1, 2, 2) assert file_path == f"{tmpdir}/test/test_from_0000000001_to_0000000002_2.csv" + def test_read(tmpdir): manager = _get_test_manager(tmpdir) file_path = manager._create_file_path("test", 1, 2, 2) @@ -74,6 +82,7 @@ def test_read(tmpdir): _clean_up(tmpdir) + def test_read_all(tmpdir): manager = _get_test_manager(tmpdir) @@ -87,12 +96,13 @@ def test_read_all(tmpdir): file.write("a,b,c\n7,8,9\n10,11,12") data = manager.read_all("test") - assert data['a'].to_list() == [1, 4, 7, 10] - assert data['b'].to_list() == [2, 5, 8, 11] - assert data['c'].to_list() == [3, 6, 9, 12] + assert data["a"].to_list() == [1, 4, 7, 10] + assert data["b"].to_list() == [2, 5, 8, 11] + assert data["c"].to_list() == [3, 6, 9, 12] _clean_up(tmpdir) + def test_get_last_file_path(tmpdir): manager = _get_test_manager(tmpdir) file_path_1 = manager._create_file_path("test", 0, 20, 2) @@ -109,15 +119,16 @@ def test_get_last_file_path(tmpdir): for file in files: # create empty files - with open( - os.path.join(folder_path, file) - , "w") as f: - pass + with open(os.path.join(folder_path, file), "w") as f: + pass - assert manager._get_last_file_path(f"{tmpdir}/test") == os.path.join(folder_path, file_path_4) + assert manager._get_last_file_path(f"{tmpdir}/test") == os.path.join( + folder_path, file_path_4 + ) _clean_up(tmpdir) + def test_write(tmpdir): manager = _get_test_manager(tmpdir) data = pl.DataFrame({"a": [1, 4], "b": [2, 5], "timestamp": [3, 6]}) @@ -126,12 +137,13 @@ def test_write(tmpdir): data = pl.read_csv(file_name) - assert data['a'].to_list() == [1, 4] - assert data['b'].to_list() == [2, 5] - assert data['timestamp'].to_list() == [3, 6] + assert data["a"].to_list() == [1, 4] + assert data["b"].to_list() == [2, 5] + assert data["timestamp"].to_list() == [3, 6] _clean_up(tmpdir) + def test_write_append(tmpdir): manager = _get_test_manager(tmpdir) data = pl.DataFrame({"a": [1, 4], "b": [2, 5], "timestamp": [3, 6]}) @@ -145,8 +157,8 @@ def test_write_append(tmpdir): data = pl.read_csv(file_name) - assert data['a'].to_list() == [1, 4, 11, 41] - assert data['b'].to_list() == [2, 5, 21, 51] - assert data['timestamp'].to_list() == [3, 6, 31, 61] + assert data["a"].to_list() == [1, 4, 11, 41] + assert data["b"].to_list() == [2, 5, 21, 51] + assert data["timestamp"].to_list() == [3, 6, 31, 61] _clean_up(tmpdir) diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index 48a3382a3..25d3e85b3 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -34,14 +34,15 @@ def __init__(self, data): "user": "0x123", } + def _clean_up(tmp_path, table_name): """ Delete test file if already exists """ folder_path = os.path.join(tmp_path, table_name) - + if os.path.exists(folder_path): - #delete files + # delete files for file in os.listdir(folder_path): file_path = os.path.join(folder_path, file) os.remove(file_path) @@ -80,6 +81,7 @@ def get_table_df(network, st_ut, fin_ut, config): if os.path.exists(file_path2): os.remove(file_path2) + def test_table_initialization(): """ Test that table is initialized correctly @@ -102,6 +104,7 @@ def test_table_initialization(): assert table.ppss.lake_ss.st_timestr == st_timestr assert 
table.ppss.lake_ss.fin_timestr == fin_timestr + def test_load_table(): """ Test that table is loading the data from file @@ -121,6 +124,7 @@ def test_load_table(): assert len(table.df) == 0 + def test_save_table(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" @@ -146,12 +150,17 @@ def test_save_table(tmpdir): first_ts = table.df.head(1)["timestamp"].to_list()[0] last_ts = table.df.tail(1)["timestamp"].to_list()[0] - test_file_path = os.path.join(str(ppss.lake_ss.parquet_dir), table_name, f"{table_name}_from_{first_ts}_to_{last_ts}_{len(table.df)}.csv") + test_file_path = os.path.join( + str(ppss.lake_ss.parquet_dir), + table_name, + f"{table_name}_from_{first_ts}_to_{last_ts}_{len(table.df)}.csv", + ) assert os.path.exists(test_file_path) printed_text = captured_output.getvalue().strip() assert "Just saved df with" in printed_text + def test_get_pdr_df(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" @@ -183,6 +192,7 @@ def test_get_pdr_df(tmpdir): assert table.df.shape[0] == 1 + def test_get_pdr_df_multiple_fetches(tmpdir): """ Test multiple table actions in one go @@ -230,6 +240,7 @@ def test_get_pdr_df_multiple_fetches(tmpdir): # test that the final df is saved assert len(table.df) == 50 + def test_all(tmpdir): """ Test multiple table actions in one go @@ -250,10 +261,12 @@ def test_all(tmpdir): if not os.path.exists(folder_path): os.makedirs(folder_path) - #create the csv file - file_path = os.path.join(folder_path, f"{table_name}_from_1701634400_to_1701634400_1.csv") + # create the csv file + file_path = os.path.join( + folder_path, f"{table_name}_from_1701634400_to_1701634400_1.csv" + ) - #write the file + # write the file with open(file_path, "w") as file: file.write("ID,pair,timeframe,prediction,payout,timestamp,slot,user\n") file.write("0x123,ADA-USDT,5m,True,28.2,1701634400000,1701634400000,0x123\n") From 494135bb396d4bd1083eb6d2e8490961a0ad9ae8 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Thu, 29 Feb 2024 16:24:19 +0300 Subject: [PATCH 12/27] take-back the gql_data_factory from the main branch --- pdr_backend/lake/gql_data_factory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdr_backend/lake/gql_data_factory.py b/pdr_backend/lake/gql_data_factory.py index e3527a291..f870ec4eb 100644 --- a/pdr_backend/lake/gql_data_factory.py +++ b/pdr_backend/lake/gql_data_factory.py @@ -139,9 +139,9 @@ def _update(self): print(" Given start time, no data to gather. 
Exit.") # to satisfy mypy, get an explicit function pointer - do_fetch: Callable[ - [str, int, int, int, int, Dict, str], pl.DataFrame - ] = table.get_pdr_df + do_fetch: Callable[[str, int, int, int, int, Dict, str], pl.DataFrame] = ( + table.get_pdr_df + ) # number of data at which we want to save to file save_backoff_limit = 5000 From 6c5ebfeb134b2300cc406f6b5088fad5fee104dd Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Thu, 29 Feb 2024 16:36:50 +0300 Subject: [PATCH 13/27] pylint issues --- pdr_backend/lake/csv_data_store.py | 8 ++++---- pdr_backend/lake/test/test_csv_data_store.py | 13 +++++++------ pdr_backend/lake/test/test_table.py | 19 +++++-------------- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index 59503e6be..390ab4dda 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -232,8 +232,8 @@ def read_all( for file_path in file_paths[1:]: data = data.vstack(pl.read_csv(file_path, schema=schema)) return data - else: - return pl.DataFrame() + + return pl.DataFrame() def _get_last_file_path(self, folder_path: str) -> str: """ @@ -260,8 +260,8 @@ def get_last_timestamp(self, dataset_identifier: str) -> Optional[int]: last_file_path = self._get_last_file_path(folder_path) if len(last_file_path): return int(last_file_path.split("_")[3]) - else: - return None + + return None def _get_last_file_row_count(self, dataset_identifier: str) -> Optional[int]: """ diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py index 91a5cb34b..4f4330991 100644 --- a/pdr_backend/lake/test/test_csv_data_store.py +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -1,5 +1,6 @@ -import polars as pl import os + +import polars as pl from pdr_backend.lake.csv_data_store import CSVDataStore @@ -11,10 +12,10 @@ def _clean_up(tmpdir): for root, dirs, files in os.walk(tmpdir): for file in files: os.remove(os.path.join(root, file)) - for dir in dirs: + for directory in dirs: # clean up the directory - _clean_up(os.path.join(root, dir)) - os.rmdir(os.path.join(root, dir)) + _clean_up(os.path.join(root, directory)) + os.rmdir(os.path.join(root, directory)) def test_get_folder_path(tmpdir): @@ -46,7 +47,7 @@ def test_get_file_paths(tmpdir): for file in files: # create empty files - with open(os.path.join(folder_path, file), "w") as f: + with open(os.path.join(folder_path, file), "w"): pass # check if empty files are created @@ -119,7 +120,7 @@ def test_get_last_file_path(tmpdir): for file in files: # create empty files - with open(os.path.join(folder_path, file), "w") as f: + with open(os.path.join(folder_path, file), "w"): pass assert manager._get_last_file_path(f"{tmpdir}/test") == os.path.join( diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index 25d3e85b3..e248000b5 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -35,7 +35,7 @@ def __init__(self, data): } -def _clean_up(tmp_path, table_name): +def _clean_up(tmp_path): """ Delete test file if already exists """ @@ -72,15 +72,6 @@ def get_table_df(network, st_ut, fin_ut, config): "user": Utf8, } table_name = "pdr_test_df" -file_path = f"./parquet_data/{table_name}.parquet" -file_path2 = "./parquet_data/test_prediction_table_multiple.parquet" - -# delete test file if already exists -if os.path.exists(file_path): - os.remove(file_path) -if os.path.exists(file_path2): - os.remove(file_path2) - def 
test_table_initialization(): """ @@ -136,7 +127,7 @@ def test_save_table(tmpdir): fin_timestr=fin_timestr, ) - _clean_up(ppss.lake_ss.parquet_dir, table_name) + _clean_up(ppss.lake_ss.parquet_dir) table = Table(table_name, table_df_schema, ppss) @@ -172,7 +163,7 @@ def test_get_pdr_df(tmpdir): fin_timestr=fin_timestr, ) - _clean_up(ppss.lake_ss.parquet_dir, table_name) + _clean_up(ppss.lake_ss.parquet_dir) table = Table(table_name, table_df_schema, ppss) @@ -208,7 +199,7 @@ def test_get_pdr_df_multiple_fetches(tmpdir): fin_timestr=fin_timestr, ) - _clean_up(ppss.lake_ss.parquet_dir, table_name) + _clean_up(ppss.lake_ss.parquet_dir) table = Table("test_prediction_table_multiple", predictions_schema, ppss) captured_output = StringIO() @@ -255,7 +246,7 @@ def test_all(tmpdir): fin_timestr=fin_timestr, ) - _clean_up(ppss.lake_ss.parquet_dir, table_name) + _clean_up(ppss.lake_ss.parquet_dir) folder_path = os.path.join(ppss.lake_ss.parquet_dir, table_name) if not os.path.exists(folder_path): From e57000bb7d4aef1c7b6542076a2a528a15e633d0 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Thu, 29 Feb 2024 16:40:29 +0300 Subject: [PATCH 14/27] issue681 - check fixes --- pdr_backend/lake/csv_data_store.py | 3 +-- pdr_backend/lake/test/test_table.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index 390ab4dda..a0e230531 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -95,8 +95,7 @@ def write( remaining_rows = 1000 - last_file_row_count # get the first remaining_rows rows - if len(data) < remaining_rows: - remaining_rows = len(data) + remaining_rows = min(remaining_rows, len(data)) remaining_data = data.slice(0, remaining_rows) diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index e248000b5..71cfcf2bc 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -73,6 +73,7 @@ def get_table_df(network, st_ut, fin_ut, config): } table_name = "pdr_test_df" + def test_table_initialization(): """ Test that table is initialized correctly From 39409174a457f19670628a09d8548a76534972aa Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Thu, 29 Feb 2024 17:21:49 +0300 Subject: [PATCH 15/27] test fixes --- pdr_backend/lake/csv_data_store.py | 2 +- pdr_backend/lake/table.py | 7 +++++-- pdr_backend/lake/test/test_gql_data_factory.py | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index a0e230531..e676d962d 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -232,7 +232,7 @@ def read_all( data = data.vstack(pl.read_csv(file_path, schema=schema)) return data - return pl.DataFrame() + return pl.DataFrame([], schema=schema) def _get_last_file_path(self, folder_path: str) -> str: """ diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 9c1db0c46..46c73ecf9 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -2,6 +2,8 @@ import os from typing import Dict, Callable import polars as pl +from polars.type_aliases import SchemaDict + from enforce_typing import enforce_types from pdr_backend.ppss.ppss import PPSS from pdr_backend.lake.plutil import has_data, newest_ut @@ -16,11 +18,12 @@ @enforce_types class Table: - def __init__(self, table_name: str, df_schema: object, ppss: PPSS): + def __init__(self, table_name: str, df_schema: 
SchemaDict, ppss: PPSS): self.ppss = ppss self.table_name = table_name self.df_schema = df_schema - self.df = pl.DataFrame(schema=df_schema) + self.df = pl.DataFrame([], schema=df_schema) + print("self.df", self.df) self.load() @enforce_types diff --git a/pdr_backend/lake/test/test_gql_data_factory.py b/pdr_backend/lake/test/test_gql_data_factory.py index b15d86c29..9c14993c3 100644 --- a/pdr_backend/lake/test/test_gql_data_factory.py +++ b/pdr_backend/lake/test/test_gql_data_factory.py @@ -66,7 +66,7 @@ def test_update(): assert count_updates == len(gql_data_factory.record_config["tables"].items()) -def test_load_parquet(): +def test_load_parquet(tmpdir): """ Test GQLDataFactory loads the data for all the tables """ @@ -75,7 +75,7 @@ def test_load_parquet(): ppss = mock_ppss( ["binance BTC/USDT c 5m"], "sapphire-mainnet", - ".", + str(tmpdir), st_timestr=st_timestr, fin_timestr=fin_timestr, ) @@ -85,6 +85,7 @@ def test_load_parquet(): assert len(gql_data_factory.record_config["tables"].items()) == 4 table = gql_data_factory.record_config["tables"]["pdr_predictions"] + assert table is not None assert type(table.df) == pl.DataFrame assert table.df.schema == table.df_schema From bb4a2dc60fb26f8b71cc11034c3accb7d5e522a0 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 17:44:04 +0300 Subject: [PATCH 16/27] issue681 - Append Logic --- pdr_backend/lake/csv_data_store.py | 62 ++++++++++---- pdr_backend/lake/table.py | 26 ++++-- pdr_backend/lake/test/test_csv_data_store.py | 86 ++++++++++++++++---- 3 files changed, 130 insertions(+), 44 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index e676d962d..0d7a3d174 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -30,7 +30,7 @@ def _fill_with_zero(self, number: int, length: int = 10) -> str: return f"{(length - len(number_str)) * '0'}{number_str}" def _create_file_name( - self, dataset_identifier: str, start_time: int, end_time: int, row_count: int + self, dataset_identifier: str, start_time: int, end_time: Optional[int] ) -> str: """ Creates a file name using the given dataset_identifier, @@ -39,28 +39,29 @@ def _create_file_name( dataset_identifier: str - identifier of the dataset start_time: int - start time of the data TIMESTAMP end_time: int - end time of the data TIMESTAMP - row_count: int - number of rows in the data """ start_time_str = self._fill_with_zero(start_time) - end_time_str = self._fill_with_zero(end_time) - return f"{dataset_identifier}_from_{start_time_str}_to_{end_time_str}_{row_count}.csv" + start_phrase = f"_from_{start_time_str}" + + end_phrase = f"_to_{self._fill_with_zero(end_time)}" if end_time else "_to_" + + return f"{dataset_identifier}{start_phrase}{end_phrase}.csv" def _create_file_path( - self, dataset_identifier: str, start_time: int, end_time: int, row_count: int + self, dataset_identifier: str, start_time: int, end_time: Optional[int] ) -> str: """ Creates the file path for the given dataset_identifier, - start_time, end_time, and row_count. + start_time, end_time. 
@args: dataset_identifier: str - identifier of the dataset start_time: str - start time of the data end_time: str - end time of the data - row_count: int - number of rows in the data """ file_name = self._create_file_name( - dataset_identifier, start_time, end_time, row_count + dataset_identifier, start_time, end_time ) folder_path = self._get_folder_path(dataset_identifier) return os.path.join(folder_path, file_name) @@ -89,10 +90,11 @@ def write( dataset_identifier: str - The dataset identifier """ + max_row_count = 1000 last_file_row_count = self._get_last_file_row_count(dataset_identifier) if last_file_row_count is not None: - if last_file_row_count < 1000: - remaining_rows = 1000 - last_file_row_count + if last_file_row_count < max_row_count: + remaining_rows = max_row_count - last_file_row_count # get the first remaining_rows rows remaining_rows = min(remaining_rows, len(data)) @@ -111,7 +113,7 @@ def write( last_file_data.write_csv(last_file_path) # change the name of the file to reflect the new row count new_file_path = self._create_file_path( - dataset_identifier, t_start_time, t_end_time, len(last_file_data) + dataset_identifier, t_start_time, t_end_time if len(data) >= remaining_rows else None ) print("new_file_path", new_file_path) @@ -120,14 +122,16 @@ def write( data = data.slice(remaining_rows, len(data) - remaining_rows) chunks = [ - data.slice(i, min(1000, len(data) - i)) for i in range(0, len(data), 1000) + data.slice(i, min(max_row_count, len(data) - i)) for i in range(0, len(data), max_row_count) ] for i, chunk in enumerate(chunks): start_time = int(chunk["timestamp"][0]) end_time = int(chunk["timestamp"][-1]) file_path = self._create_file_path( - dataset_identifier, start_time, end_time, len(chunk) + dataset_identifier, + start_time, + end_time if len(chunk) >= max_row_count else None ) chunk.write_csv(file_path) @@ -142,6 +146,26 @@ def bulk_write(self, data_list: List[pl.DataFrame], dataset_identifier: str): for data in data_list: self.write(dataset_identifier, data) + def _get_to_value(self, file_path: str) -> int: + """ + Returns the end time from the given file_path. + @args: + file_path: str - path of the file + @returns: + int - end time from the file_path + """ + return int(file_path.split("/")[-1].split("_")[4].replace('.csv', '')) + + def _get_from_value(self, file_path: str) -> int: + """ + Returns the start time from the given file_path. 
+ @args: + file_path: str - path of the file + @returns: + int - start time from the file_path + """ + return int(file_path.split("/")[-1].split("_")[2]) + def _get_file_paths( self, folder_path: str, start_time: str, end_time: str ) -> List[str]: @@ -160,15 +184,15 @@ def _get_file_paths( file_paths = [os.path.join(folder_path, file_name) for file_name in file_names] # find files which has a higher start time and lower end time - print("file_paths_aaaa", file_paths) file_paths = [ file_path for file_path in file_paths # firstly, take the filename from the path (/path/to/file.csv -> file.csv) # then, split the filename by "_" and take the 4th and 5th elements # then, convert them to int and check if they are in the range - if int(file_path.split("/")[-1].split("_")[2]) >= int(start_time) - and int(file_path.split("/")[-1].split("_")[4]) <= int(end_time) + if self._get_from_value(file_path) >= int(start_time) + and (self._get_to_value(file_path) <= int(end_time) + or self._get_to_value(file_path) == 0) ] return file_paths @@ -281,6 +305,8 @@ def _get_last_file_row_count(self, dataset_identifier: str) -> Optional[int]: last_file_path = os.path.join(folder_path, file_names[-1]) - # parse the row count from the file name - row_count = int(last_file_path.split("_")[-1].split(".")[0]) + # Read the last file + last_file = pl.read_csv(last_file_path) + row_count = last_file.shape[0] + return row_count diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 46c73ecf9..7f73ffa3c 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -42,9 +42,8 @@ def load(self): @enforce_types def save(self): """ - Get the data from subgraph and write it to Parquet file - write to parquet file - parquet only supports appending via the pyarrow engine + Save the data from the DataFrame object into the CSV file + It only saves the new data that has been fetched """ assert "timestamp" in self.df.columns and self.df["timestamp"].dtype == pl.Int64 @@ -55,12 +54,23 @@ def save(self): <= self.df.tail(1)["timestamp"].to_list()[0] ) - cur_df = self.csv_data_store.read_all(self.table_name, schema=self.df_schema) - - self.df = pl.concat([cur_df, self.df]) self.df = self.df.filter(pl.struct("ID").is_unique()) - self.csv_data_store.write(self.table_name, self.df, schema=self.df_schema) - n_new = self.df.shape[0] - cur_df.shape[0] + + if len(self.df) == 0: + print(f" No new data to save for {self.table_name}") + return + + self._append_to_csv(self.df) + + self.df = pl.DataFrame([], schema=self.df_schema) + + def _append_to_csv(self, data: pl.DataFrame): + """ + Append the data from the DataFrame object into the CSV file + It only saves the new data that has been fetched + """ + self.csv_data_store.append(self.table_name, data, schema=self.df_schema) + n_new = self.df.shape[0] print( f" Just saved df with {n_new} df rows to the csv files of {self.table_name}" ) diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py index 4f4330991..b59467ba2 100644 --- a/pdr_backend/lake/test/test_csv_data_store.py +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -26,17 +26,17 @@ def test_get_folder_path(tmpdir): def test_create_file_name(tmpdir): manager = _get_test_manager(tmpdir) - file_name = manager._create_file_name("test", 1707030362, 1709060200, 1000) - print("file_name", file_name) - assert file_name == "test_from_1707030362_to_1709060200_1000.csv" + file_name = manager._create_file_name("test", 1707030362, 1709060200) + print("file_name---", 
file_name) + assert file_name == "test_from_1707030362_to_1709060200.csv" def test_get_file_paths(tmpdir): manager = _get_test_manager(tmpdir) - file_name_1 = manager._create_file_name("test", 0, 20, 20) - file_name_2 = manager._create_file_name("test", 21, 40, 20) - file_name_3 = manager._create_file_name("test", 41, 60, 20) - file_name_4 = manager._create_file_name("test", 61, 80, 20) + file_name_1 = manager._create_file_name("test", 0, 20) + file_name_2 = manager._create_file_name("test", 21, 40) + file_name_3 = manager._create_file_name("test", 41, 60) + file_name_4 = manager._create_file_name("test", 61, 80) files = [file_name_1, file_name_2, file_name_3, file_name_4] @@ -67,13 +67,18 @@ def test_get_file_paths(tmpdir): def test_create_file_path(tmpdir): manager = _get_test_manager(tmpdir) - file_path = manager._create_file_path("test", 1, 2, 2) - assert file_path == f"{tmpdir}/test/test_from_0000000001_to_0000000002_2.csv" + file_path = manager._create_file_path("test", 1, 2) + assert file_path == f"{tmpdir}/test/test_from_0000000001_to_0000000002.csv" +def test_create_file_path_without_endtime(tmpdir): + manager = _get_test_manager(tmpdir) + file_path = manager._create_file_path("test", 1, None) + assert file_path == f"{tmpdir}/test/test_from_0000000001_to_.csv" + def test_read(tmpdir): manager = _get_test_manager(tmpdir) - file_path = manager._create_file_path("test", 1, 2, 2) + file_path = manager._create_file_path("test", 1, 2) with open(file_path, "w") as file: file.write("a,b,c\n1,2,3\n4,5,6") @@ -87,8 +92,8 @@ def test_read(tmpdir): def test_read_all(tmpdir): manager = _get_test_manager(tmpdir) - file_path_1 = manager._create_file_path("test", 0, 20, 2) - file_path_2 = manager._create_file_path("test", 21, 41, 2) + file_path_1 = manager._create_file_path("test", 0, 20) + file_path_2 = manager._create_file_path("test", 21, 41) with open(file_path_1, "w") as file: file.write("a,b,c\n1,2,3\n4,5,6") @@ -106,10 +111,10 @@ def test_read_all(tmpdir): def test_get_last_file_path(tmpdir): manager = _get_test_manager(tmpdir) - file_path_1 = manager._create_file_path("test", 0, 20, 2) - file_path_2 = manager._create_file_path("test", 21, 41, 2) - file_path_3 = manager._create_file_path("test", 42, 62, 2) - file_path_4 = manager._create_file_path("test", 63, 83, 2) + file_path_1 = manager._create_file_path("test", 0, 20) + file_path_2 = manager._create_file_path("test", 21, 41) + file_path_3 = manager._create_file_path("test", 42, 62) + file_path_4 = manager._create_file_path("test", 63, 83) files = [file_path_1, file_path_2, file_path_3, file_path_4] @@ -134,7 +139,7 @@ def test_write(tmpdir): manager = _get_test_manager(tmpdir) data = pl.DataFrame({"a": [1, 4], "b": [2, 5], "timestamp": [3, 6]}) manager.write("test", data) - file_name = manager._create_file_path("test", 3, 6, 2) + file_name = manager._create_file_path("test", 3, None) data = pl.read_csv(file_name) @@ -144,6 +149,35 @@ def test_write(tmpdir): _clean_up(tmpdir) +def test_write_1000_rows(tmpdir): + _clean_up(tmpdir) + + manager = _get_test_manager(tmpdir) + data = pl.DataFrame( + { + "a": list(range(1000)), + "b": list(range(1000)), + "timestamp": list(range(1000)), + } + ) + manager.write("test", data) + + folder_path = manager._get_folder_path("test") + + # get folder including files + # folder = os.listdir(folder_path) + #print folder files + # print("folder---", folder) + + file_name = manager._create_file_path("test", 0, 999) + + data = pl.read_csv(file_name) + + assert data["a"].to_list() == list(range(1000)) + 
assert data["b"].to_list() == list(range(1000)) + assert data["timestamp"].to_list() == list(range(1000)) + + _clean_up(tmpdir) def test_write_append(tmpdir): manager = _get_test_manager(tmpdir) @@ -154,7 +188,7 @@ def test_write_append(tmpdir): data = pl.DataFrame({"a": [11, 41], "b": [21, 51], "timestamp": [31, 61]}) manager.write("test", data) - file_name = manager._create_file_path("test", 3, 61, 4) + file_name = manager._create_file_path("test", 3, 61) data = pl.read_csv(file_name) @@ -163,3 +197,19 @@ def test_write_append(tmpdir): assert data["timestamp"].to_list() == [3, 6, 31, 61] _clean_up(tmpdir) + +def test_fill_with_zero(): + manager = CSVDataStore("test") + assert manager._fill_with_zero(1, 10) == "0000000001" + assert manager._fill_with_zero(100) == "0000000100" + assert manager._fill_with_zero(1000) == "0000001000" + +def test_get_to_value(): + manager = CSVDataStore("test") + assert manager._get_to_value('test/test_from_0_to_0000000001.csv') == 1 + assert manager._get_to_value('test/test_from_0_to_0000000005.csv') == 5 + +def test_get_from_value(): + manager = CSVDataStore("test") + assert manager._get_from_value('test/test_from_0000000001_to_0000000001.csv') == 1 + assert manager._get_from_value('test/test_from_0000000005_to_.csv') == 5 From e8c5843e846137c44102f1d4b6071d286567a33a Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 17:51:06 +0300 Subject: [PATCH 17/27] black pylint fix --- pdr_backend/lake/csv_data_store.py | 23 +++++++++++--------- pdr_backend/lake/test/test_csv_data_store.py | 18 ++++++++++----- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pdr_backend/lake/csv_data_store.py b/pdr_backend/lake/csv_data_store.py index 0d7a3d174..a34b6aadf 100644 --- a/pdr_backend/lake/csv_data_store.py +++ b/pdr_backend/lake/csv_data_store.py @@ -60,9 +60,7 @@ def _create_file_path( end_time: str - end time of the data """ - file_name = self._create_file_name( - dataset_identifier, start_time, end_time - ) + file_name = self._create_file_name(dataset_identifier, start_time, end_time) folder_path = self._get_folder_path(dataset_identifier) return os.path.join(folder_path, file_name) @@ -113,7 +111,9 @@ def write( last_file_data.write_csv(last_file_path) # change the name of the file to reflect the new row count new_file_path = self._create_file_path( - dataset_identifier, t_start_time, t_end_time if len(data) >= remaining_rows else None + dataset_identifier, + t_start_time, + t_end_time if len(data) >= remaining_rows else None, ) print("new_file_path", new_file_path) @@ -122,7 +122,8 @@ def write( data = data.slice(remaining_rows, len(data) - remaining_rows) chunks = [ - data.slice(i, min(max_row_count, len(data) - i)) for i in range(0, len(data), max_row_count) + data.slice(i, min(max_row_count, len(data) - i)) + for i in range(0, len(data), max_row_count) ] for i, chunk in enumerate(chunks): @@ -131,7 +132,7 @@ def write( file_path = self._create_file_path( dataset_identifier, start_time, - end_time if len(chunk) >= max_row_count else None + end_time if len(chunk) >= max_row_count else None, ) chunk.write_csv(file_path) @@ -154,8 +155,8 @@ def _get_to_value(self, file_path: str) -> int: @returns: int - end time from the file_path """ - return int(file_path.split("/")[-1].split("_")[4].replace('.csv', '')) - + return int(file_path.split("/")[-1].split("_")[4].replace(".csv", "")) + def _get_from_value(self, file_path: str) -> int: """ Returns the start time from the given file_path. 
@@ -191,8 +192,10 @@ def _get_file_paths( # then, split the filename by "_" and take the 4th and 5th elements # then, convert them to int and check if they are in the range if self._get_from_value(file_path) >= int(start_time) - and (self._get_to_value(file_path) <= int(end_time) - or self._get_to_value(file_path) == 0) + and ( + self._get_to_value(file_path) <= int(end_time) + or self._get_to_value(file_path) == 0 + ) ] return file_paths diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py index b59467ba2..0cfde9fad 100644 --- a/pdr_backend/lake/test/test_csv_data_store.py +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -76,6 +76,7 @@ def test_create_file_path_without_endtime(tmpdir): file_path = manager._create_file_path("test", 1, None) assert file_path == f"{tmpdir}/test/test_from_0000000001_to_.csv" + def test_read(tmpdir): manager = _get_test_manager(tmpdir) file_path = manager._create_file_path("test", 1, 2) @@ -149,6 +150,7 @@ def test_write(tmpdir): _clean_up(tmpdir) + def test_write_1000_rows(tmpdir): _clean_up(tmpdir) @@ -162,11 +164,11 @@ def test_write_1000_rows(tmpdir): ) manager.write("test", data) - folder_path = manager._get_folder_path("test") + #folder_path = manager._get_folder_path("test") # get folder including files # folder = os.listdir(folder_path) - #print folder files + # print folder files # print("folder---", folder) file_name = manager._create_file_path("test", 0, 999) @@ -179,6 +181,7 @@ def test_write_1000_rows(tmpdir): _clean_up(tmpdir) + def test_write_append(tmpdir): manager = _get_test_manager(tmpdir) data = pl.DataFrame({"a": [1, 4], "b": [2, 5], "timestamp": [3, 6]}) @@ -198,18 +201,21 @@ def test_write_append(tmpdir): _clean_up(tmpdir) + def test_fill_with_zero(): manager = CSVDataStore("test") assert manager._fill_with_zero(1, 10) == "0000000001" assert manager._fill_with_zero(100) == "0000000100" assert manager._fill_with_zero(1000) == "0000001000" + def test_get_to_value(): manager = CSVDataStore("test") - assert manager._get_to_value('test/test_from_0_to_0000000001.csv') == 1 - assert manager._get_to_value('test/test_from_0_to_0000000005.csv') == 5 + assert manager._get_to_value("test/test_from_0_to_0000000001.csv") == 1 + assert manager._get_to_value("test/test_from_0_to_0000000005.csv") == 5 + def test_get_from_value(): manager = CSVDataStore("test") - assert manager._get_from_value('test/test_from_0000000001_to_0000000001.csv') == 1 - assert manager._get_from_value('test/test_from_0000000005_to_.csv') == 5 + assert manager._get_from_value("test/test_from_0000000001_to_0000000001.csv") == 1 + assert manager._get_from_value("test/test_from_0000000005_to_.csv") == 5 From db99f6e975d86ed02b029850b199a85d72abf0ea Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 19:24:34 +0300 Subject: [PATCH 18/27] issue681 _append_to_csv --- pdr_backend/lake/etl.py | 2 ++ pdr_backend/lake/table.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pdr_backend/lake/etl.py b/pdr_backend/lake/etl.py index bcdfd3a96..e826f3db2 100644 --- a/pdr_backend/lake/etl.py +++ b/pdr_backend/lake/etl.py @@ -94,3 +94,5 @@ def update_bronze_pdr_predictions(self): table = get_bronze_pdr_predictions_table(self.tables, self.ppss) table.save() + ## Add the CSV and duckDB data store here + ## with table.df and table.schema diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 7f73ffa3c..0ce96097d 100644 --- a/pdr_backend/lake/table.py +++ 
b/pdr_backend/lake/table.py @@ -69,8 +69,8 @@ def _append_to_csv(self, data: pl.DataFrame): Append the data from the DataFrame object into the CSV file It only saves the new data that has been fetched """ - self.csv_data_store.append(self.table_name, data, schema=self.df_schema) - n_new = self.df.shape[0] + self.csv_data_store.write(self.table_name, data, schema=self.df_schema) + n_new = data.shape[0] print( f" Just saved df with {n_new} df rows to the csv files of {self.table_name}" ) @@ -133,10 +133,11 @@ def get_pdr_df( ) and len(final_df) > 0: assert df.schema == self.df_schema # save to parquet - self.df = final_df.clone() - self.save() + self._append_to_csv(final_df) + # self._append_to_db() + print(f"Saved {len(final_df)} records to file while fetching") - final_df = pl.DataFrame() + final_df = pl.DataFrame([], schema=self.df_schema) save_backoff_count = 0 # avoids doing next fetch if we've reached the end @@ -145,8 +146,9 @@ def get_pdr_df( pagination_offset += pagination_limit if len(final_df) > 0: - self.df = final_df.clone() - self.save() + self._append_to_csv(final_df) + # self._append_to_db() + print(f"Saved {len(final_df)} records to file while fetching") @enforce_types From d5468163115fe7ab401a4ec0e306d75fecafbb1f Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 19:31:13 +0300 Subject: [PATCH 19/27] issue681 - black fix --- pdr_backend/lake/test/test_csv_data_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdr_backend/lake/test/test_csv_data_store.py b/pdr_backend/lake/test/test_csv_data_store.py index 0cfde9fad..81c09063d 100644 --- a/pdr_backend/lake/test/test_csv_data_store.py +++ b/pdr_backend/lake/test/test_csv_data_store.py @@ -164,7 +164,7 @@ def test_write_1000_rows(tmpdir): ) manager.write("test", data) - #folder_path = manager._get_folder_path("test") + # folder_path = manager._get_folder_path("test") # get folder including files # folder = os.listdir(folder_path) From 2df30f82d96ea31df6974d6abd07999a14d71286 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 20:19:03 +0300 Subject: [PATCH 20/27] issue685 - integration with 681 and 617 --- pdr_backend/lake/table.py | 53 ++++++------- pdr_backend/lake/test/test_table.py | 117 ++++++++++++++++++---------- 2 files changed, 101 insertions(+), 69 deletions(-) diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 0ce96097d..3070d95ec 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -12,6 +12,7 @@ from pdr_backend.lake.plutil import _object_list_to_df from pdr_backend.lake.table_pdr_predictions import _transform_timestamp_to_ms from pdr_backend.lake.csv_data_store import CSVDataStore +from pdr_backend.lake.persistent_data_store import PersistentDataStore logger = logging.getLogger("table") @@ -33,41 +34,25 @@ def load(self): """ print(f"Loading data for {self.table_name}") self.csv_data_store = CSVDataStore(self.ppss.lake_ss.parquet_dir) + self.persistent_data_store = PersistentDataStore(self.ppss.lake_ss.parquet_dir) + st_ut = self.ppss.lake_ss.st_timestamp fin_ut = self.ppss.lake_ss.fin_timestamp self.df = self.csv_data_store.read( self.table_name, st_ut, fin_ut, schema=self.df_schema ) - @enforce_types - def save(self): - """ - Save the data from the DataFrame object into the CSV file - It only saves the new data that has been fetched - """ - - assert "timestamp" in self.df.columns and self.df["timestamp"].dtype == pl.Int64 - assert len(self.df) > 0 - if len(self.df) > 2: - assert ( - 
self.df.head(1)["timestamp"].to_list()[0] - <= self.df.tail(1)["timestamp"].to_list()[0] - ) - - self.df = self.df.filter(pl.struct("ID").is_unique()) - - if len(self.df) == 0: - print(f" No new data to save for {self.table_name}") - return - - self._append_to_csv(self.df) - - self.df = pl.DataFrame([], schema=self.df_schema) + def _append_both(self, data: pl.DataFrame): + self._append_to_csv(data) + self._append_to_db(data) def _append_to_csv(self, data: pl.DataFrame): """ Append the data from the DataFrame object into the CSV file It only saves the new data that has been fetched + + @arguments: + data - The Polars DataFrame to save. """ self.csv_data_store.write(self.table_name, data, schema=self.df_schema) n_new = data.shape[0] @@ -75,6 +60,18 @@ def _append_to_csv(self, data: pl.DataFrame): f" Just saved df with {n_new} df rows to the csv files of {self.table_name}" ) + def _append_to_db(self, data: pl.DataFrame): + """ + Append the data from the DataFrame object into the database + It only saves the new data that has been fetched + + @arguments: + data - The Polars DataFrame to save. + """ + self.persistent_data_store.insert_to_table(data, self.table_name) + n_new = data.shape[0] + print(f" Just saved df with {n_new} df rows to the database of {self.table_name}") + @enforce_types def get_pdr_df( self, @@ -99,7 +96,7 @@ def get_pdr_df( save_backoff_count = 0 pagination_offset = 0 - final_df = pl.DataFrame() + final_df = pl.DataFrame([], schema=self.df_schema) while True: # call the function @@ -133,8 +130,7 @@ def get_pdr_df( ) and len(final_df) > 0: assert df.schema == self.df_schema # save to parquet - self._append_to_csv(final_df) - # self._append_to_db() + self._append_both(final_df) print(f"Saved {len(final_df)} records to file while fetching") final_df = pl.DataFrame([], schema=self.df_schema) @@ -146,8 +142,7 @@ def get_pdr_df( pagination_offset += pagination_limit if len(final_df) > 0: - self._append_to_csv(final_df) - # self._append_to_db() + self._append_both(final_df) print(f"Saved {len(final_df)} records to file while fetching") diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index 71cfcf2bc..b829c95ef 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -1,6 +1,8 @@ from io import StringIO import os import sys +import random + from polars import Boolean, Float64, Int64, Utf8 import polars as pl from pdr_backend.ppss.ppss import mock_ppss @@ -116,8 +118,7 @@ def test_load_table(): assert len(table.df) == 0 - -def test_save_table(tmpdir): +def test_get_pdr_df(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" ppss = mock_ppss( @@ -131,43 +132,10 @@ def test_save_table(tmpdir): _clean_up(ppss.lake_ss.parquet_dir) table = Table(table_name, table_df_schema, ppss) - + captured_output = StringIO() sys.stdout = captured_output - assert len(table.df) == 0 - table.df = pl.DataFrame([mocked_object], table_df_schema) - table.save() - - first_ts = table.df.head(1)["timestamp"].to_list()[0] - last_ts = table.df.tail(1)["timestamp"].to_list()[0] - - test_file_path = os.path.join( - str(ppss.lake_ss.parquet_dir), - table_name, - f"{table_name}_from_{first_ts}_to_{last_ts}_{len(table.df)}.csv", - ) - assert os.path.exists(test_file_path) - printed_text = captured_output.getvalue().strip() - - assert "Just saved df with" in printed_text - - -def test_get_pdr_df(tmpdir): - st_timestr = "2023-12-03" - fin_timestr = "2023-12-05" - ppss = mock_ppss( - ["binance BTC/USDT c 5m"], - "sapphire-mainnet", - 
str(tmpdir), - st_timestr=st_timestr, - fin_timestr=fin_timestr, - ) - - _clean_up(ppss.lake_ss.parquet_dir) - - table = Table(table_name, table_df_schema, ppss) - save_backoff_limit = 5000 pagination_limit = 1000 st_timest = UnixTimeMs(1701634300000) @@ -182,7 +150,10 @@ def test_get_pdr_df(tmpdir): {"contract_list": ["0x123"]}, ) - assert table.df.shape[0] == 1 + printed_text = captured_output.getvalue().strip() + count_fetches = printed_text.count("Fetched") + assert count_fetches == 1 + # assert table.df.shape[0] == 1 def test_get_pdr_df_multiple_fetches(tmpdir): @@ -203,6 +174,7 @@ def test_get_pdr_df_multiple_fetches(tmpdir): _clean_up(ppss.lake_ss.parquet_dir) table = Table("test_prediction_table_multiple", predictions_schema, ppss) + captured_output = StringIO() sys.stdout = captured_output @@ -219,6 +191,7 @@ def test_get_pdr_df_multiple_fetches(tmpdir): pagination_limit=pagination_limit, config={"contract_list": ["0x18f54cc21b7a2fdd011bea06bba7801b280e3151"]}, ) + printed_text = captured_output.getvalue().strip() # test fetches multiple times @@ -229,9 +202,6 @@ def test_get_pdr_df_multiple_fetches(tmpdir): count_saves = printed_text.count("Saved") assert count_saves == 2 - # test that the final df is saved - assert len(table.df) == 50 - def test_all(tmpdir): """ @@ -269,3 +239,70 @@ def test_all(tmpdir): table.load() assert len(table.df) == 1 + +def test_append_to_db(tmpdir): + """ + Test that table is loading the data from file + """ + st_timestr = "2023-12-03" + fin_timestr = "2024-12-05" + ppss = mock_ppss( + ["binance BTC/USDT c 5m"], + "sapphire-mainnet", + str(tmpdir), + st_timestr=st_timestr, + fin_timestr=fin_timestr, + ) + + _clean_up(ppss.lake_ss.parquet_dir) + + table = Table(table_name, table_df_schema, ppss) + table.load() + + assert len(table.df) == 0 + + table._append_to_db(pl.DataFrame([mocked_object] * 1000, schema=table_df_schema)) + + result = table.persistent_data_store.query_data( + table.table_name, + "SELECT * FROM {view_name}" + ) + + assert result["ID"][0] == "0x123" + assert result["pair"][0] == "ADA-USDT" + assert result["timeframe"][0] == "5m" + assert result["prediction"][0] == True + assert len(result) == 1000 + +def test_append_to_csv(tmpdir): + """ + Test that table is loading the data from file + """ + st_timestr = "2023-12-03" + fin_timestr = "2024-12-05" + ppss = mock_ppss( + ["binance BTC/USDT c 5m"], + "sapphire-mainnet", + str(tmpdir), + st_timestr=st_timestr, + fin_timestr=fin_timestr, + ) + + _clean_up(ppss.lake_ss.parquet_dir) + + table = Table(table_name, table_df_schema, ppss) + table.load() + + assert len(table.df) == 0 + + table._append_to_csv(pl.DataFrame([mocked_object] * 1000, schema=table_df_schema)) + + file_path = os.path.join( + ppss.lake_ss.parquet_dir, table_name, f"{table_name}_from_1701634400_to_1701634400.csv" + ) + + assert os.path.exists(file_path) + + with open(file_path, "r") as file: + lines = file.readlines() + assert len(lines) == 1001 From c84d85f8fec139d6149a55255a286c09da4b9712 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 20:25:14 +0300 Subject: [PATCH 21/27] issue681 - black pylint fixes --- pdr_backend/lake/table.py | 6 ++++-- .../lake/test/test_persistent_data_store.py | 2 +- pdr_backend/lake/test/test_table.py | 17 ++++++++++------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 3070d95ec..08e80c87d 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -35,7 +35,7 @@ def load(self): 
print(f"Loading data for {self.table_name}") self.csv_data_store = CSVDataStore(self.ppss.lake_ss.parquet_dir) self.persistent_data_store = PersistentDataStore(self.ppss.lake_ss.parquet_dir) - + st_ut = self.ppss.lake_ss.st_timestamp fin_ut = self.ppss.lake_ss.fin_timestamp self.df = self.csv_data_store.read( @@ -70,7 +70,9 @@ def _append_to_db(self, data: pl.DataFrame): """ self.persistent_data_store.insert_to_table(data, self.table_name) n_new = data.shape[0] - print(f" Just saved df with {n_new} df rows to the database of {self.table_name}") + print( + f" Just saved df with {n_new} df rows to the database of {self.table_name}" + ) @enforce_types def get_pdr_df( diff --git a/pdr_backend/lake/test/test_persistent_data_store.py b/pdr_backend/lake/test/test_persistent_data_store.py index 33549b986..e4438980e 100644 --- a/pdr_backend/lake/test/test_persistent_data_store.py +++ b/pdr_backend/lake/test/test_persistent_data_store.py @@ -33,7 +33,7 @@ def _clean_up_test_manager(tmpdir, dataset_identifier): persistent_ds_instance.duckdb_conn.execute(f"DROP TABLE {view_name}") -def _check_view_exists(tmpdir, test_manager, dataset_identifier): +def _check_view_exists(test_manager, dataset_identifier): view_name = test_manager._generate_view_name(dataset_identifier) tables = test_manager.duckdb_conn.execute( "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'" diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index b829c95ef..f38f4bb84 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -1,7 +1,6 @@ from io import StringIO import os import sys -import random from polars import Boolean, Float64, Int64, Utf8 import polars as pl @@ -118,6 +117,7 @@ def test_load_table(): assert len(table.df) == 0 + def test_get_pdr_df(tmpdir): st_timestr = "2023-12-03" fin_timestr = "2023-12-05" @@ -132,7 +132,7 @@ def test_get_pdr_df(tmpdir): _clean_up(ppss.lake_ss.parquet_dir) table = Table(table_name, table_df_schema, ppss) - + captured_output = StringIO() sys.stdout = captured_output @@ -191,7 +191,7 @@ def test_get_pdr_df_multiple_fetches(tmpdir): pagination_limit=pagination_limit, config={"contract_list": ["0x18f54cc21b7a2fdd011bea06bba7801b280e3151"]}, ) - + printed_text = captured_output.getvalue().strip() # test fetches multiple times @@ -240,6 +240,7 @@ def test_all(tmpdir): table.load() assert len(table.df) == 1 + def test_append_to_db(tmpdir): """ Test that table is loading the data from file @@ -264,16 +265,16 @@ def test_append_to_db(tmpdir): table._append_to_db(pl.DataFrame([mocked_object] * 1000, schema=table_df_schema)) result = table.persistent_data_store.query_data( - table.table_name, - "SELECT * FROM {view_name}" + table.table_name, "SELECT * FROM {view_name}" ) assert result["ID"][0] == "0x123" assert result["pair"][0] == "ADA-USDT" assert result["timeframe"][0] == "5m" - assert result["prediction"][0] == True + assert result["prediction"][0] is True assert len(result) == 1000 + def test_append_to_csv(tmpdir): """ Test that table is loading the data from file @@ -298,7 +299,9 @@ def test_append_to_csv(tmpdir): table._append_to_csv(pl.DataFrame([mocked_object] * 1000, schema=table_df_schema)) file_path = os.path.join( - ppss.lake_ss.parquet_dir, table_name, f"{table_name}_from_1701634400_to_1701634400.csv" + ppss.lake_ss.parquet_dir, + table_name, + f"{table_name}_from_1701634400_to_1701634400.csv", ) assert os.path.exists(file_path) From bc14bf16b8698031eefbc44c2265c32b08d3eba6 Mon Sep 17 
00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 20:32:06 +0300 Subject: [PATCH 22/27] ETL save step --- pdr_backend/lake/etl.py | 7 +++---- pdr_backend/lake/table.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pdr_backend/lake/etl.py b/pdr_backend/lake/etl.py index e826f3db2..54cc6d33b 100644 --- a/pdr_backend/lake/etl.py +++ b/pdr_backend/lake/etl.py @@ -92,7 +92,6 @@ def update_bronze_pdr_predictions(self): ) self.tables[bronze_pdr_predictions_table_name] = table - table = get_bronze_pdr_predictions_table(self.tables, self.ppss) - table.save() - ## Add the CSV and duckDB data store here - ## with table.df and table.schema + table = get_bronze_pdr_predictions_table(self.tables, self.ppss) + table.append_to_sources(table.df) + diff --git a/pdr_backend/lake/table.py b/pdr_backend/lake/table.py index 08e80c87d..7c56bbbdc 100644 --- a/pdr_backend/lake/table.py +++ b/pdr_backend/lake/table.py @@ -42,7 +42,7 @@ def load(self): self.table_name, st_ut, fin_ut, schema=self.df_schema ) - def _append_both(self, data: pl.DataFrame): + def append_to_sources(self, data: pl.DataFrame): self._append_to_csv(data) self._append_to_db(data) @@ -132,7 +132,7 @@ def get_pdr_df( ) and len(final_df) > 0: assert df.schema == self.df_schema # save to parquet - self._append_both(final_df) + self.append_to_sources(final_df) print(f"Saved {len(final_df)} records to file while fetching") final_df = pl.DataFrame([], schema=self.df_schema) @@ -144,7 +144,7 @@ def get_pdr_df( pagination_offset += pagination_limit if len(final_df) > 0: - self._append_both(final_df) + self.append_to_sources(final_df) print(f"Saved {len(final_df)} records to file while fetching") From ff3961bae9bf70e786bcdb0e732ad88a0f46a03d Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 20:35:16 +0300 Subject: [PATCH 23/27] issue685 - black fix --- pdr_backend/lake/etl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pdr_backend/lake/etl.py b/pdr_backend/lake/etl.py index 54cc6d33b..94af91f62 100644 --- a/pdr_backend/lake/etl.py +++ b/pdr_backend/lake/etl.py @@ -92,6 +92,5 @@ def update_bronze_pdr_predictions(self): ) self.tables[bronze_pdr_predictions_table_name] = table - table = get_bronze_pdr_predictions_table(self.tables, self.ppss) + table = get_bronze_pdr_predictions_table(self.tables, self.ppss) table.append_to_sources(table.df) - From d808ab6051e4be0c28fe41a2d1b6e5ae92eb9186 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 20:51:44 +0300 Subject: [PATCH 24/27] issue685 - tests are fixed --- .../lake/test/test_persistent_data_store.py | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/pdr_backend/lake/test/test_persistent_data_store.py b/pdr_backend/lake/test/test_persistent_data_store.py index e4438980e..c5677a67f 100644 --- a/pdr_backend/lake/test/test_persistent_data_store.py +++ b/pdr_backend/lake/test/test_persistent_data_store.py @@ -47,7 +47,7 @@ def test_create_and_fill_table(tmpdir): test_manager._create_and_fill_table(example_df, dataset_identifier) # Check if the view is registered - assert _check_view_exists(tmpdir, test_manager, dataset_identifier) + assert _check_view_exists(test_manager, dataset_identifier) _clean_up_test_manager(tmpdir, dataset_identifier) @@ -57,9 +57,7 @@ def test_insert_to_exist_table(tmpdir): test_manager._create_and_fill_table(example_df, dataset_identifier) # Check if the view is registered - check_result, view_name = _check_view_exists( - tmpdir, test_manager, 
dataset_identifier - ) + check_result, view_name = _check_view_exists(test_manager, dataset_identifier) assert check_result # Insert new data to the table @@ -69,9 +67,7 @@ def test_insert_to_exist_table(tmpdir): test_manager.insert_to_table(example_df, dataset_identifier) # Check if the view is registered - check_result, view_name = _check_view_exists( - tmpdir, test_manager, dataset_identifier - ) + check_result, view_name = _check_view_exists(test_manager, dataset_identifier) assert check_result # Check if the new data is inserted @@ -93,9 +89,7 @@ def test_insert_to_new_table(tmpdir): test_manager.insert_to_table(example_df, dataset_identifier) # Check if the view is registered - check_result, view_name = _check_view_exists( - tmpdir, test_manager, dataset_identifier - ) + check_result, view_name = _check_view_exists(test_manager, dataset_identifier) assert check_result # Check if the new data is inserted @@ -115,7 +109,7 @@ def test_query_data(tmpdir): test_manager.insert_to_table(example_df, dataset_identifier) # Check if the view is registered - check_result, _ = _check_view_exists(tmpdir, test_manager, dataset_identifier) + check_result, _ = _check_view_exists(test_manager, dataset_identifier) assert check_result # Execute the provided SQL query @@ -132,9 +126,7 @@ def test_drop_table(tmpdir): test_manager.insert_to_table(example_df, dataset_identifier) # Check if the view is registered - check_result, view_name = _check_view_exists( - tmpdir, test_manager, dataset_identifier - ) + check_result, view_name = _check_view_exists(test_manager, dataset_identifier) assert check_result # Drop the table @@ -157,9 +149,8 @@ def test_fill_from_csv_destination(tmpdir): test_manager.fill_from_csv_destination(csv_folder_path, dataset_identifier) # Check if the view is registered - check_result, view_name = _check_view_exists( - tmpdir, test_manager, dataset_identifier - ) + check_result, view_name = _check_view_exists(test_manager, dataset_identifier) + assert check_result # Check if the new data is inserted From f6702163ed95c6f6d3db52aae3a48fc019f5af32 Mon Sep 17 00:00:00 2001 From: Mustafa Tuncay Date: Mon, 4 Mar 2024 21:16:27 +0300 Subject: [PATCH 25/27] issue685 - system tests are fixed --- system_tests/test_get_traction_info_system.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/system_tests/test_get_traction_info_system.py b/system_tests/test_get_traction_info_system.py index 17c8e0e0c..c6125d14b 100644 --- a/system_tests/test_get_traction_info_system.py +++ b/system_tests/test_get_traction_info_system.py @@ -17,7 +17,7 @@ @patch("pdr_backend.analytics.get_predictions_info.plot_slot_daily_statistics") @patch("pdr_backend.analytics.get_predictions_info.GQLDataFactory.get_gql_tables") -def test_traction_info_system(mock_get_gql_tables, mock_plot_stats, caplog): +def test_traction_info_system(mock_get_gql_tables, mock_plot_stats, caplog, tmpdir): feed_addr = "0x2d8e2267779d27c2b3ed5408408ff15d9f3a3152" user_addr = "0xaaaa4cb4ff2584bad80ff5f109034a891c3d88dd" mock_predictions = [ @@ -42,7 +42,7 @@ def test_traction_info_system(mock_get_gql_tables, mock_plot_stats, caplog): ppss = mock_ppss( ["binance BTC/USDT c 5m"], "sapphire-mainnet", - ".", + str(tmpdir), st_timestr=st_timestr, fin_timestr=fin_timestr, ) @@ -72,7 +72,7 @@ def test_traction_info_system(mock_get_gql_tables, mock_plot_stats, caplog): "get_traction_info", "2023-12-01", "2023-12-31", - "./dir", + str(tmpdir), "ppss.yaml", "sapphire-testnet", ] From 34d520ba7378d9aa2bb311992fbfd9aa9947969a Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Mustafa=20Tun=C3=A7ay?= Date: Tue, 5 Mar 2024 04:51:25 +0300 Subject: [PATCH 26/27] #650 - Clean (predvalue, truevalue) columns (#664) * issue650 renaming * issue650 - test fixes * issue650 black format * issue650: fixes after merges * black fix * take-back the gql_data_factory from the main branch * Removed print statements --------- Co-authored-by: idiom-bytes --- pdr_backend/analytics/predictoor_stats.py | 12 ++++++------ pdr_backend/lake/table_bronze_pdr_predictions.py | 12 ++++++------ pdr_backend/lake/table_pdr_payouts.py | 2 +- pdr_backend/lake/table_pdr_predictions.py | 4 ++-- pdr_backend/lake/table_pdr_truevals.py | 2 +- pdr_backend/lake/test/test_etl.py | 4 ++-- pdr_backend/lake/test/test_table.py | 6 +++--- pdr_backend/subgraph/payout.py | 8 ++++---- pdr_backend/subgraph/prediction.py | 16 ++++++++-------- pdr_backend/subgraph/subgraph_payout.py | 2 +- pdr_backend/subgraph/subgraph_predictions.py | 8 ++++---- pdr_backend/subgraph/subgraph_trueval.py | 2 +- .../subgraph/test/test_subgraph_payout.py | 2 +- .../subgraph/test/test_subgraph_predictions.py | 8 ++++---- .../subgraph/test/test_subgraph_trueval.py | 2 +- pdr_backend/subgraph/trueval.py | 8 ++++---- pdr_backend/util/csvs.py | 8 ++++---- pdr_backend/util/test_noganache/test_csvs.py | 8 ++++---- 18 files changed, 57 insertions(+), 57 deletions(-) diff --git a/pdr_backend/analytics/predictoor_stats.py b/pdr_backend/analytics/predictoor_stats.py index 27520a85d..bdc8809c9 100644 --- a/pdr_backend/analytics/predictoor_stats.py +++ b/pdr_backend/analytics/predictoor_stats.py @@ -34,7 +34,7 @@ class PredictoorStat(TypedDict): def get_feed_summary_stats(predictions_df: pl.DataFrame) -> pl.DataFrame: # 1 - filter from lake only the rows that you're looking for df = predictions_df.filter( - ~((pl.col("trueval").is_null()) | (pl.col("payout").is_null())) + ~((pl.col("truevalue").is_null()) | (pl.col("payout").is_null())) ) # Group by pair @@ -42,8 +42,8 @@ def get_feed_summary_stats(predictions_df: pl.DataFrame) -> pl.DataFrame: pl.col("source").first().alias("source"), pl.col("payout").sum().alias("sum_payout"), pl.col("stake").sum().alias("sum_stake"), - pl.col("prediction").count().alias("num_predictions"), - (pl.col("prediction").sum() / pl.col("pair").count() * 100).alias("accuracy"), + pl.col("predvalue").count().alias("num_predictions"), + (pl.col("predvalue").sum() / pl.col("pair").count() * 100).alias("accuracy"), ) return df @@ -53,7 +53,7 @@ def get_feed_summary_stats(predictions_df: pl.DataFrame) -> pl.DataFrame: def get_predictoor_summary_stats(predictions_df: pl.DataFrame) -> pl.DataFrame: # 1 - filter from lake only the rows that you're looking for df = predictions_df.filter( - ~((pl.col("trueval").is_null()) | (pl.col("payout").is_null())) + ~((pl.col("truevalue").is_null()) | (pl.col("payout").is_null())) ) # Group by pair @@ -61,8 +61,8 @@ def get_predictoor_summary_stats(predictions_df: pl.DataFrame) -> pl.DataFrame: pl.col("source").first().alias("source"), pl.col("payout").sum().alias("sum_payout"), pl.col("stake").sum().alias("sum_stake"), - pl.col("prediction").count().alias("num_predictions"), - (pl.col("prediction").sum() / pl.col("pair").count() * 100).alias("accuracy"), + pl.col("predvalue").count().alias("num_predictions"), + (pl.col("predvalue").sum() / pl.col("pair").count() * 100).alias("accuracy"), ) return df diff --git a/pdr_backend/lake/table_bronze_pdr_predictions.py b/pdr_backend/lake/table_bronze_pdr_predictions.py index 8146ec391..314254a3d 100644 --- 
a/pdr_backend/lake/table_bronze_pdr_predictions.py +++ b/pdr_backend/lake/table_bronze_pdr_predictions.py @@ -59,8 +59,8 @@ def get_slot_id(_id: str) -> str: bronze_predictions_df = predictions_df.with_columns( [ pl.col("ID").map_elements(get_slot_id, return_dtype=Utf8).alias("slot_id"), - pl.col("prediction").alias("predvalue"), - pl.col("trueval").alias("truevalue"), + pl.col("predvalue").alias("predvalue"), + pl.col("truevalue").alias("truevalue"), pl.col("timestamp").alias("timestamp"), pl.col("timestamp").alias("last_event_timestamp"), ] @@ -93,14 +93,14 @@ def _process_truevals(tables: Dict[str, Table], ppss: PPSS) -> Dict[str, Table]: predictions_df.join(truevals_df, left_on="slot_id", right_on="ID", how="left") .with_columns( [ - pl.col("trueval").fill_null(pl.col("truevalue")), + pl.col("truevalue_right").fill_null(pl.col("truevalue")), pl.col("timestamp_right").fill_null(pl.col("last_event_timestamp")), ] ) .drop(["truevalue", "last_event_timestamp"]) .rename( { - "trueval": "truevalue", + "truevalue_right": "truevalue", "timestamp_right": "last_event_timestamp", } ) @@ -135,7 +135,7 @@ def _process_payouts(tables: Dict[str, Table], ppss: PPSS) -> Dict[str, Table]: .with_columns( [ pl.col("payout_right").fill_null(pl.col("payout")), - pl.col("predictedValue").fill_null(pl.col("predvalue")), + pl.col("predvalue_right").fill_null(pl.col("predvalue")), pl.col("stake_right").fill_null(pl.col("stake")), pl.col("timestamp_right").fill_null(pl.col("last_event_timestamp")), ] @@ -144,7 +144,7 @@ def _process_payouts(tables: Dict[str, Table], ppss: PPSS) -> Dict[str, Table]: .rename( { "payout_right": "payout", - "predictedValue": "predvalue", + "predvalue_right": "predvalue", "stake_right": "stake", "timestamp_right": "last_event_timestamp", } diff --git a/pdr_backend/lake/table_pdr_payouts.py b/pdr_backend/lake/table_pdr_payouts.py index 9304f5bdb..e76a281ba 100644 --- a/pdr_backend/lake/table_pdr_payouts.py +++ b/pdr_backend/lake/table_pdr_payouts.py @@ -10,7 +10,7 @@ "slot": Int64, "timestamp": Int64, "payout": Float64, - "predictedValue": Boolean, + "predvalue": Boolean, "revenue": Float64, "roundSumStakesUp": Float64, "roundSumStakes": Float64, diff --git a/pdr_backend/lake/table_pdr_predictions.py b/pdr_backend/lake/table_pdr_predictions.py index 8475af189..8b5903f74 100644 --- a/pdr_backend/lake/table_pdr_predictions.py +++ b/pdr_backend/lake/table_pdr_predictions.py @@ -10,9 +10,9 @@ "contract": Utf8, "pair": Utf8, "timeframe": Utf8, - "prediction": Boolean, + "predvalue": Boolean, "stake": Float64, - "trueval": Boolean, + "truevalue": Boolean, "timestamp": Int64, "source": Utf8, "payout": Float64, diff --git a/pdr_backend/lake/table_pdr_truevals.py b/pdr_backend/lake/table_pdr_truevals.py index 7f5b68fde..619bec2bc 100644 --- a/pdr_backend/lake/table_pdr_truevals.py +++ b/pdr_backend/lake/table_pdr_truevals.py @@ -7,6 +7,6 @@ "ID": Utf8, "token": Utf8, "timestamp": Int64, - "trueval": Boolean, + "truevalue": Boolean, "slot": Int64, } diff --git a/pdr_backend/lake/test/test_etl.py b/pdr_backend/lake/test/test_etl.py index 64d747642..091b00b1e 100644 --- a/pdr_backend/lake/test/test_etl.py +++ b/pdr_backend/lake/test/test_etl.py @@ -201,11 +201,11 @@ def test_etl_do_bronze_step( assert ( bronze_pdr_predictions_df["truevalue"][1] - == _gql_datafactory_etl_truevals_df["trueval"][1] + == _gql_datafactory_etl_truevals_df["truevalue"][1] ) assert ( bronze_pdr_predictions_df["truevalue"][2] - == _gql_datafactory_etl_truevals_df["trueval"][2] + == 
_gql_datafactory_etl_truevals_df["truevalue"][2] ) # Assert payout ts > prediction ts diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index f38f4bb84..e142bb560 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -17,7 +17,7 @@ def __init__(self, data): self.ID = data["ID"] self.pair = data["pair"] self.timeframe = data["timeframe"] - self.prediction = data["prediction"] + self.predvalue = data["predvalue"] self.payout = data["payout"] self.timestamp = data["timestamp"] self.slot = data["slot"] @@ -28,7 +28,7 @@ def __init__(self, data): "ID": "0x123", "pair": "ADA-USDT", "timeframe": "5m", - "prediction": True, + "predvalue": True, "payout": 28.2, "timestamp": 1701634400, "slot": 1701634400, @@ -66,7 +66,7 @@ def get_table_df(network, st_ut, fin_ut, config): "ID": Utf8, "pair": Utf8, "timeframe": Utf8, - "prediction": Boolean, + "predvalue": Boolean, "payout": Float64, "timestamp": Int64, "slot": Int64, diff --git a/pdr_backend/subgraph/payout.py b/pdr_backend/subgraph/payout.py index 5fcd6dd2f..f1baead0e 100644 --- a/pdr_backend/subgraph/payout.py +++ b/pdr_backend/subgraph/payout.py @@ -14,7 +14,7 @@ def __init__( slot: UnixTimeS, timestamp: UnixTimeS, payout: float, - predictedValue: bool, + predvalue: bool, revenue: float, roundSumStakesUp: float, roundSumStakes: float, @@ -26,7 +26,7 @@ def __init__( self.token = token self.slot = slot self.payout = payout - self.predictedValue = predictedValue + self.predvalue = predvalue self.revenue = revenue self.roundSumStakesUp = roundSumStakesUp self.roundSumStakes = roundSumStakes @@ -42,7 +42,7 @@ def mock_payout(payout_tuple: tuple) -> Payout: token, slot, payout, - predictedValue, + predvalue, revenue, roundSumStakesUp, roundSumStakes, @@ -56,7 +56,7 @@ def mock_payout(payout_tuple: tuple) -> Payout: token=token, slot=UnixTimeS(slot), payout=payout, - predictedValue=predictedValue, + predvalue=predvalue, revenue=revenue, roundSumStakesUp=roundSumStakesUp, roundSumStakes=roundSumStakes, diff --git a/pdr_backend/subgraph/prediction.py b/pdr_backend/subgraph/prediction.py index c9d5dbb47..b959df666 100644 --- a/pdr_backend/subgraph/prediction.py +++ b/pdr_backend/subgraph/prediction.py @@ -13,9 +13,9 @@ def __init__( contract: str, pair: str, timeframe: str, - prediction: Union[bool, None], # prediction = subgraph.predicted_value + predvalue: Union[bool, None], stake: Union[float, None], - trueval: Union[bool, None], + truevalue: Union[bool, None], timestamp: UnixTimeS, # timestamp == prediction submitted timestamp source: str, payout: Union[float, None], @@ -26,9 +26,9 @@ def __init__( self.contract = contract self.pair = pair self.timeframe = timeframe - self.prediction = prediction # predvalue + self.predvalue = predvalue self.stake = stake - self.trueval = trueval # truevalue + self.truevalue = truevalue self.timestamp = timestamp self.source = source self.payout = payout @@ -46,9 +46,9 @@ def mock_prediction(prediction_tuple: tuple) -> Prediction: contract, pair_str, timeframe_str, - prediction, + predvalue, stake, - trueval, + truevalue, timestamp, source, payout, @@ -62,9 +62,9 @@ def mock_prediction(prediction_tuple: tuple) -> Prediction: contract=contract, pair=pair_str, timeframe=timeframe_str, - prediction=prediction, + predvalue=predvalue, stake=stake, - trueval=trueval, + truevalue=truevalue, timestamp=UnixTimeS(timestamp), source=source, payout=payout, diff --git a/pdr_backend/subgraph/subgraph_payout.py b/pdr_backend/subgraph/subgraph_payout.py 
index 069b64f10..2a77e94a0 100644 --- a/pdr_backend/subgraph/subgraph_payout.py +++ b/pdr_backend/subgraph/subgraph_payout.py @@ -156,8 +156,8 @@ def fetch_payouts( "token": payout["prediction"]["slot"]["predictContract"]["token"][ "name" ], + "predvalue": bool(payout["predictedValue"]), "slot": UnixTimeS(int(payout["id"].split("-")[1])), - "predictedValue": bool(payout["predictedValue"]), "revenue": float(payout["prediction"]["slot"]["revenue"]), "roundSumStakesUp": float( payout["prediction"]["slot"]["roundSumStakesUp"] diff --git a/pdr_backend/subgraph/subgraph_predictions.py b/pdr_backend/subgraph/subgraph_predictions.py index 81b48d665..62fa0d4ad 100644 --- a/pdr_backend/subgraph/subgraph_predictions.py +++ b/pdr_backend/subgraph/subgraph_predictions.py @@ -139,14 +139,14 @@ def fetch_filtered_predictions( slot = UnixTimeS(int(prediction_sg_dict["slot"]["slot"])) user = prediction_sg_dict["user"]["id"] address = prediction_sg_dict["id"].split("-")[0] - trueval = None + truevalue = None payout = None predicted_value = None stake = None if not prediction_sg_dict["payout"] is None: stake = float(prediction_sg_dict["stake"]) - trueval = prediction_sg_dict["payout"]["trueValue"] + truevalue = prediction_sg_dict["payout"]["trueValue"] predicted_value = prediction_sg_dict["payout"]["predictedValue"] payout = float(prediction_sg_dict["payout"]["payout"]) @@ -155,9 +155,9 @@ def fetch_filtered_predictions( contract=address, pair=pair, timeframe=timeframe, - prediction=predicted_value, + predvalue=predicted_value, stake=stake, - trueval=trueval, + truevalue=truevalue, timestamp=timestamp, source=source, payout=payout, diff --git a/pdr_backend/subgraph/subgraph_trueval.py b/pdr_backend/subgraph/subgraph_trueval.py index b6578c3cb..e9dcc885d 100644 --- a/pdr_backend/subgraph/subgraph_trueval.py +++ b/pdr_backend/subgraph/subgraph_trueval.py @@ -118,7 +118,7 @@ def fetch_truevals( ID=ID, token=token, timestamp=timestamp, - trueval=truevalue, + truevalue=truevalue, slot=slot, ) diff --git a/pdr_backend/subgraph/test/test_subgraph_payout.py b/pdr_backend/subgraph/test/test_subgraph_payout.py index 9d89a36be..fd71bc978 100644 --- a/pdr_backend/subgraph/test/test_subgraph_payout.py +++ b/pdr_backend/subgraph/test/test_subgraph_payout.py @@ -81,7 +81,7 @@ def test_fetch_payouts(mock_query_subgraph): assert payouts[0].timestamp == 1698527000 assert payouts[0].slot == 1696880700 assert payouts[0].payout == float(0) - assert payouts[0].predictedValue is True + assert payouts[0].predvalue is True assert payouts[0].user == "0xd2a24cb4ff2584bad80ff5f109034a891c3d88dd" assert payouts[0].stake == float(1.2) assert mock_query_subgraph.call_count == 1 diff --git a/pdr_backend/subgraph/test/test_subgraph_predictions.py b/pdr_backend/subgraph/test/test_subgraph_predictions.py index 266525b59..5cc9a5f85 100644 --- a/pdr_backend/subgraph/test/test_subgraph_predictions.py +++ b/pdr_backend/subgraph/test/test_subgraph_predictions.py @@ -20,9 +20,9 @@ contract="0x18f54cc21b7a2fdd011bea06bba7801b280e3151", pair="ADA/USDT", timeframe="5m", - prediction=True, + predvalue=True, stake=0.050051425480971974, - trueval=False, + truevalue=False, timestamp=UnixTimeS(1698527000), source="binance", payout=0.0, @@ -128,8 +128,8 @@ def test_fetch_filtered_predictions(mock_query_subgraph): assert predictions[0].user == "0xd2a24cb4ff2584bad80ff5f109034a891c3d88dd" assert predictions[0].pair == "ADA/USDT" assert predictions[0].contract == "0x18f54cc21b7a2fdd011bea06bba7801b280e3151" - assert predictions[0].trueval is False - assert 
predictions[0].prediction is True + assert predictions[0].truevalue is False + assert predictions[0].predvalue is True assert mock_query_subgraph.call_count == 1 diff --git a/pdr_backend/subgraph/test/test_subgraph_trueval.py b/pdr_backend/subgraph/test/test_subgraph_trueval.py index 44e4d92f2..72e76f0b6 100644 --- a/pdr_backend/subgraph/test/test_subgraph_trueval.py +++ b/pdr_backend/subgraph/test/test_subgraph_trueval.py @@ -63,5 +63,5 @@ def test_fetch_filtered_truevals(mock_query_subgraph): assert truevals[0].token == "ADA/USDT" assert truevals[0].timestamp == 1698527000 assert truevals[0].slot == 1698527100 - assert truevals[0].trueval is True + assert truevals[0].truevalue is True assert mock_query_subgraph.call_count == 1 diff --git a/pdr_backend/subgraph/trueval.py b/pdr_backend/subgraph/trueval.py index 946ec7906..67841249c 100644 --- a/pdr_backend/subgraph/trueval.py +++ b/pdr_backend/subgraph/trueval.py @@ -12,11 +12,11 @@ def __init__( ID: str, timestamp: UnixTimeS, token: str, - trueval: Union[bool, None], + truevalue: Union[bool, None], slot: UnixTimeS, # slot/epoch timestamp ) -> None: self.ID = ID - self.trueval = trueval + self.truevalue = truevalue self.timestamp = timestamp self.token = token self.slot = slot @@ -28,12 +28,12 @@ def __init__( @enforce_types def mock_trueval(trueval_tuple: tuple) -> Trueval: - (ID, timestamp, token, trueval, slot) = trueval_tuple + (ID, timestamp, token, truevalue, slot) = trueval_tuple return Trueval( ID=ID, token=token, + truevalue=truevalue, slot=UnixTimeS(slot), - trueval=trueval, timestamp=UnixTimeS(timestamp), ) diff --git a/pdr_backend/util/csvs.py b/pdr_backend/util/csvs.py index c3300c5c8..ebf5f38fa 100644 --- a/pdr_backend/util/csvs.py +++ b/pdr_backend/util/csvs.py @@ -80,8 +80,8 @@ def save_prediction_csv(all_predictions: List[Prediction], csv_output_dir: str): all_predictions, csv_output_dir, { - "Predicted Value": "prediction", - "True Value": "trueval", + "Predicted Value": "predvalue", + "True Value": "truevalue", "Timestamp": "timestamp", "Stake": "stake", "Payout": "payout", @@ -101,7 +101,7 @@ def save_analysis_csv(all_predictions: List[Prediction], csv_output_dir: str): "Stake": "stake", "Wallet": "user", "Payout": "payout", - "True Value": "trueval", - "Predicted Value": "prediction", + "True Value": "truevalue", + "Predicted Value": "predvalue", }, ) diff --git a/pdr_backend/util/test_noganache/test_csvs.py b/pdr_backend/util/test_noganache/test_csvs.py index 35a5f91e0..d285797bc 100644 --- a/pdr_backend/util/test_noganache/test_csvs.py +++ b/pdr_backend/util/test_noganache/test_csvs.py @@ -18,8 +18,8 @@ def test_save_analysis_csv(tmpdir): data = csv.DictReader(f) data_rows = list(data) - assert data_rows[0]["Predicted Value"] == str(predictions[0].prediction) - assert data_rows[0]["True Value"] == str(predictions[0].trueval) + assert data_rows[0]["Predicted Value"] == str(predictions[0].predvalue) + assert data_rows[0]["True Value"] == str(predictions[0].truevalue) assert data_rows[0]["Timestamp"] == str(predictions[0].timestamp) assert list(data_rows[0].keys()) == [ "PredictionID", @@ -46,8 +46,8 @@ def test_save_prediction_csv(tmpdir): data = csv.DictReader(f) data_rows = list(row for row in data) - assert data_rows[0]["Predicted Value"] == str(predictions[0].prediction) - assert data_rows[0]["True Value"] == str(predictions[0].trueval) + assert data_rows[0]["Predicted Value"] == str(predictions[0].predvalue) + assert data_rows[0]["True Value"] == str(predictions[0].truevalue) assert data_rows[0]["Timestamp"] == 
str(predictions[0].timestamp) assert list(data_rows[0].keys()) == [ "Predicted Value", From 957a0252a822006e4f1411fc5196884f7a6ecdb2 Mon Sep 17 00:00:00 2001 From: idiom-bytes Date: Mon, 4 Mar 2024 18:08:15 -0800 Subject: [PATCH 27/27] Fixing test --- pdr_backend/lake/test/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdr_backend/lake/test/test_table.py b/pdr_backend/lake/test/test_table.py index e142bb560..b08153fd9 100644 --- a/pdr_backend/lake/test/test_table.py +++ b/pdr_backend/lake/test/test_table.py @@ -271,7 +271,7 @@ def test_append_to_db(tmpdir): assert result["ID"][0] == "0x123" assert result["pair"][0] == "ADA-USDT" assert result["timeframe"][0] == "5m" - assert result["prediction"][0] is True + assert result["predvalue"][0] is True assert len(result) == 1000
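
A minimal sketch of how the renamed (predvalue, truevalue) columns from PATCH 26/27 feed the aggregations in pdr_backend/analytics/predictoor_stats.py. The toy data and the group_by("pair") call are assumptions for illustration; the filter and agg expressions mirror the ones visible in the diff above.

    import polars as pl

    # Toy predictions frame using the post-rename column names
    # ("predvalue" was "prediction", "truevalue" was "trueval").
    predictions_df = pl.DataFrame(
        {
            "pair": ["ADA/USDT", "ADA/USDT", "BTC/USDT"],
            "source": ["binance", "binance", "binance"],
            "predvalue": [True, False, True],
            "truevalue": [True, None, True],
            "stake": [1.0, 2.0, 3.0],
            "payout": [1.5, None, 4.0],
        }
    )

    # Drop rows with a missing truevalue or payout, then aggregate per pair,
    # as in get_feed_summary_stats / get_predictoor_summary_stats.
    stats = (
        predictions_df.filter(
            ~(pl.col("truevalue").is_null() | pl.col("payout").is_null())
        )
        .group_by("pair")  # assumed; only "# Group by pair" appears in the diff
        .agg(
            pl.col("source").first().alias("source"),
            pl.col("payout").sum().alias("sum_payout"),
            pl.col("stake").sum().alias("sum_stake"),
            pl.col("predvalue").count().alias("num_predictions"),
            (pl.col("predvalue").sum() / pl.col("pair").count() * 100).alias("accuracy"),
        )
    )
    print(stats)

Since polars sums a Boolean column as its count of True values, "accuracy" here is the percentage of the already-filtered rows whose predvalue is True.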