Commit afcdf21

Merge pull request #95 from Leibniz-HBI/90-featbackend-simplify-database-tables-with-json-columns

90 featbackend simplify database tables with json columns

pekasen authored Dec 6, 2024
2 parents eec4fa3 + 0cac941 commit afcdf21
Showing 23 changed files with 800 additions and 721 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -154,3 +154,5 @@ notebooks/

# spiderexpress files
*.pe.yml

.DS_Store
72 changes: 46 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
@@ -50,8 +50,6 @@ In the future we will provide a PyPI package which will make the installation pr

## Usage



```bash
$ spiderexpress --help
Usage: spiderexpress [OPTIONS] COMMAND [ARGS]...
  ...

Options:
Commands:
create create a new configuration
start start a job
list list all available connectors and strategies
```

### create
```bash
...
Options:
  --interactive / --non-interactive
```

This leads you through a dialogue to create a configuration file for your project.
You need to supply at least the following information:
- the name of the project,
- a path for the seed file.

### start

This command starts a `spiderexpress` job with the given configuration file.
@@ -100,7 +105,7 @@

A `spiderexpress` project could for example look like this:

```
 ├── my_project
 │   ├── my_project.pe.yml
 │   ├── my_project.db
-│   └── seed_file.txt
+│   └── seed_file.json
```

Here, `my_project.db` is the resulting database, `my_project.pe.yml` is the project's configuration, in which the data source, sampling strategy, and other parameters may be specified (see [Configuration](#configuration) for further details), and `seed_file.json` contains the seed node names.
@@ -114,35 +119,50 @@ For example projects, please refer to the `examples` directory or the unit tests
The resulting file could look something like this example:
```diff
-project_name: spider
-batch_size: 150
-db_url: test2.sqlite
-edge_table_name: edge_list
-node_table_name: node_list
-seeds:
-  - ...
-connector: telegram
-strategy:
-  spikyball:
-    layer_max_size: 150
-    sampler:
-      source_node_probability:
-        coefficient: 1
-        weights:
-          subscriber_count: 4
-          videos_count: 1
-      target_node_probability:
-        coefficient: 1
-        weights:
-      edge_probability:
-        coefficient: 1
-        weights:
-          views: 1
+project_name: spider
+db_url: sqlite:/// # in memory database for testing
+db_schema:
+empty_seeds: stop
+max_iteration: 10000
+layers:
+  test:
+    eager: false
+    connector:
+      csv:
+        node_list_location: tests/stubs/7th_graders/nodes.csv
+        edge_list_location: tests/stubs/7th_graders/edges.csv
+        mode: out
+    routers:
+      - all: # This is the name of the router and should be the type of edge.
+          source: source # This is the field that is mapped to the source columns.
+          target:
+            - field: target # This is the field that is mapped to the target columns.
+              dispatch_with: test # This is the name of the layer to dispatch to.
+    sampler:
+      random:
+        n: 5
+seeds:
+  test:
+    - "1"
+    - "13"
```
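To illustrate how `max_iteration`, `empty_seeds`, and the per-layer `seeds` settings interact, here is a minimal, purely illustrative sketch of a collection loop — this is not spiderexpress's actual implementation, and the values are taken from the example configuration:

```python
from collections import deque

# Values mirroring the example configuration above.
max_iteration = 10000
empty_seeds = "stop"  # the CLI prompt also mentions "retry"
seeds = {"test": ["1", "13"]}

# Each layer starts from its configured seed nodes.
frontier = deque(seeds["test"])
visited = []

for iteration in range(max_iteration):
    if not frontier:
        if empty_seeds == "stop":
            break  # "stop": end the job once the frontier runs dry
        frontier.extend(seeds["test"])  # "retry": reseed and continue
    visited.append(frontier.popleft())

print(visited)  # → ['1', '13']
```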
> [!NOTE]
> **Nomenclature:** `spiderexpress` uses `source` and `target` as the variable names for edges; nodes must have a `name`.

## Table Schemas

`spiderexpress` is an entirely persistent application, meaning that all of its state is kept,
including the data retrieved from the connectors, the dense network layer, the sparse network, as well as the application's own
data.

Most of the user-facing data is kept in the following few tables:

- `raw_data`, here we keep the raw data as returned by the connector.
- `layer_dense_edges` and `layer_dense_nodes`, here we keep the network information on the dense network.
- `layer_sparse_edges` and `layer_sparse_nodes`, same as above, but only the sampled edges are included.


How the tables are structured is determined by the configuration file.
The following sections describe the minimal configuration for the tables as well as the configuration syntax.
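As a rough illustration of the JSON-column approach this commit moves to, a raw connector payload can live in a single JSON column and still be queried. The sketch below uses SQLite's built-in JSON functions; the table and column names are assumptions for illustration, not spiderexpress's actual schema:

```python
import json
import sqlite3

# Illustrative only: spiderexpress derives its schema from the configuration.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE raw_data (id INTEGER PRIMARY KEY, payload TEXT)")

# Store an arbitrary connector response as one JSON document.
record = {"name": "42", "subscriber_count": 1000}
conn.execute("INSERT INTO raw_data (payload) VALUES (?)", (json.dumps(record),))

# SQLite's json_extract lets us query inside the JSON column without
# a dedicated column per field.
row = conn.execute(
    "SELECT json_extract(payload, '$.name') FROM raw_data"
).fetchone()
print(row[0])  # → 42
```

The upside of this design is that connectors with different response shapes share one table; the trade-off is that per-field indexing needs JSON expressions.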

1 change: 1 addition & 0 deletions pyproject.toml
@@ -15,6 +15,7 @@ click = "*"
psycopg2-binary = "*"
PyYAML = "*"
transitions = "*"
pydantic = "^2.9.2"

[tool.poetry.dev-dependencies]
ipykernel = "*"
4 changes: 3 additions & 1 deletion spiderexpress/__init__.py
@@ -1,6 +1,8 @@
 """.. include:: ../README.md"""

 from .spider import Spider
-from .types import Configuration
+from .types import Configuration, PlugIn
+
+__all__ = ["Spider", "Configuration", "PlugIn"]

 __version__ = "0.1.0a0"
34 changes: 20 additions & 14 deletions spiderexpress/cli.py
@@ -11,13 +11,14 @@
- Refine verbs/commands for the CLI
- find a mechanism for stopping/starting collections
"""

+import sys
 from importlib.metadata import entry_points
 from pathlib import Path
+from loguru import logger as log

 import click
 import yaml
-import sys
-from loguru import logger as log

 from .spider import CONNECTOR_GROUP, STRATEGY_GROUP, Spider
 from .types import Configuration
@@ -34,22 +35,21 @@ def cli(ctx):
 @cli.command()
 @click.argument("config", type=click.Path(path_type=Path, exists=True))
 @click.option("-v", "--verbose", count=True)
-@click.option("-l", "--logfile", type=click.Path(dir_okay=False, writable=True, path_type=str))
+@click.option(
+    "-l", "--logfile", type=click.Path(dir_okay=False, writable=True, path_type=str)
+)
 @click.pass_context
 def start(ctx: click.Context, config: Path, verbose: int, logfile: str):
     """start a job"""
-    logging_level = max(50 - (10 * verbose), 0)  # Allows logging level to be between 0 and 50.
+    logging_level = max(
+        50 - (10 * verbose), 0
+    )  # Allows logging level to be between 0 and 50.
     logging_configuration = {
-        "handlers": [
-            {
-                "sink": logfile or sys.stdout,
-                "level": logging_level
-            }
-        ],
-        "extra": {}
+        "handlers": [{"sink": logfile or sys.stdout, "level": logging_level}],
+        "extra": {},
     }
     log.configure(**logging_configuration)
     log.debug(f"Starting logging with verbosity {logging_level}.")
     ctx.obj.start(config)


@@ -62,7 +62,13 @@ def create(config: str, interactive: bool):

     if interactive:
         for key, description in [
-            ("seeds", "add seeds?"),
+            ("project_name", "Name of your project?"),
+            ("db_url", "URL of your database?"),
+            ("max_iteration", "How many iterations should be done?"),
+            (
+                "empty_seeds",
+                "What should happen if seeds are empty? Can be 'stop' or 'retry'",
+            ),
             ("seed_file", "do you wish to read a file for seeds?"),
         ]:
             args[key] = click.prompt(description)
@@ -76,7 +82,7 @@
 @cli.command()
 def list():  # pylint: disable=W0622
     """list all plugins"""
-    click.echo("--- connectors ---", color="blue")
+    click.echo("--- connectors ---")
     for connector in entry_points(group=CONNECTOR_GROUP):
         click.echo(connector.name)
     click.echo("--- strategies ---")
14 changes: 9 additions & 5 deletions spiderexpress/connectors/csv.py
@@ -1,13 +1,15 @@
 """A CSV-reading, network-rippin' connector for your testing purposes."""

+import dataclasses
 from typing import Dict, List, Optional, Tuple, Union

 import pandas as pd

-from spiderexpress.types import PlugIn, fromdict
+from spiderexpress.types import PlugIn, from_dict

 _cache = {}

+
 @dataclasses.dataclass
 class CSVConnectorConfiguration:
     """Configuration items for the csv_connector."""
@@ -23,7 +25,7 @@ def csv_connector(
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """The CSV connector!"""
     if isinstance(configuration, dict):
-        configuration = fromdict(CSVConnectorConfiguration, configuration)
+        configuration = from_dict(CSVConnectorConfiguration, configuration)

     if configuration.cache:
         if configuration.edge_list_location not in _cache:
@@ -61,9 +63,11 @@ def csv_connector(

     return (
         edge_return,
-        nodes.loc[nodes.name.isin(node_ids), :]
-        if nodes is not None
-        else pd.DataFrame(),
+        (
+            nodes.loc[nodes.name.isin(node_ids), :]
+            if nodes is not None
+            else pd.DataFrame()
+        ),
     )


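The renamed `from_dict` helper turns a plain mapping from the YAML configuration into a configuration dataclass. A minimal sketch of what such a helper might look like — illustrative only; the real `spiderexpress.types.from_dict` may handle nesting and validation differently, and the configuration fields shown are partly assumed:

```python
import dataclasses
from typing import Any, Dict, Optional


def from_dict(cls, data: Dict[str, Any]):
    """Instantiate a dataclass from a dict, ignoring unknown keys."""
    field_names = {field.name for field in dataclasses.fields(cls)}
    return cls(**{key: value for key, value in data.items() if key in field_names})


@dataclasses.dataclass
class CSVConnectorConfiguration:
    """Shape mirrors the csv connector's configuration; defaults are assumed."""

    edge_list_location: str
    node_list_location: Optional[str] = None
    mode: str = "out"
    cache: bool = False


config = from_dict(
    CSVConnectorConfiguration,
    {"edge_list_location": "edges.csv", "cache": True, "unknown_key": 1},
)
print(config.edge_list_location, config.cache)  # → edges.csv True
```

Dropping unknown keys keeps configuration loading tolerant of extra YAML entries; a stricter loader could raise on them instead.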