Skip to content

Commit

Permalink
Merge branch 'main' into LuluFreeDesign-patch-4
Browse files Browse the repository at this point in the history
  • Loading branch information
loicguillois authored Jun 11, 2024
2 parents 7cabb4b + 7c0d37b commit 4b513b4
Show file tree
Hide file tree
Showing 15 changed files with 306 additions and 0 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/github-actions-data-stack.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Deploys the Dagster data stack by calling the repository's reusable
# deploy workflow.
name: Data Stack CI

on:
  push:
    # Only pushes that touch the analytics/ tree trigger this workflow.
    paths:
      - analytics/**
    # NOTE(review): there is no `branches` filter, so a push on any branch
    # triggers the job while it always passes `branch: main` below - confirm
    # this is intended.

jobs:
  deploy-dagster:
    uses: ./.github/workflows/deploy.yml
    with:
      app: dagster-production
      branch: main
    # Hand the caller's secrets to the reusable workflow.
    secrets: inherit
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@ yarn-error.log*

/talisman_report
.clever.json

# Data Stack
# Compiled Python bytecode: the glob is required, a bare `.pyc` would only
# match a file literally named `.pyc`.
*.pyc
__pycache__
analytics/dagster/storage/
1 change: 1 addition & 0 deletions analytics/dagster/.nux/nux.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
seen: 1
21 changes: 21 additions & 0 deletions analytics/dagster/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
FROM python:3.10-slim

# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir dagster-webserver dagster-postgres dagster-aws

# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

ENV DAGSTER_HOME=/opt/dagster/dagster_home/

RUN mkdir -p $DAGSTER_HOME

COPY dagster.yaml workspace.yaml $DAGSTER_HOME

# COPY with a directory source copies the directory's *contents*, not the
# directory itself. workspace.yaml loads the code as the Python module `src`,
# so the package must land in an explicit src/ sub-directory.
COPY src/ ${DAGSTER_HOME}src/

WORKDIR $DAGSTER_HOME

EXPOSE 3000

# NOTE(review): only the webserver is started here. Dagster schedules are
# normally evaluated by a separate dagster-daemon process - confirm one runs
# elsewhere, otherwise the daily schedule will not fire.
ENTRYPOINT ["dagster-webserver", "-h", "0.0.0.0", "-p", "3000"]
15 changes: 15 additions & 0 deletions analytics/dagster/dagster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Dagster instance configuration.
# Instance storage lives in PostgreSQL; the credentials, host and database
# name are read from environment variables, while the port is fixed.
storage:
  postgres:
    postgres_db:
      username:
        env: DAGSTER_PG_USERNAME
      password:
        env: DAGSTER_PG_PASSWORD
      hostname:
        env: DAGSTER_PG_HOST
      db_name:
        env: DAGSTER_PG_DB
      port: 5432

# Telemetry is switched off for this instance.
telemetry:
  enabled: false
1 change: 1 addition & 0 deletions analytics/dagster/data/most_frequent_words.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"\u2013": 9, "new": 6, "hn": 5, "why": 5, "ai": 4, "show": 4, "from": 4, "macos": 3, "language": 3, "server": 3, "at": 3, "video": 3, "may": 3, "be": 3, "using": 3, "pdf": 3, "vision": 2, "sequoia": 2, "virtual": 2, "arm": 2, "exploring": 2, "apple's": 2, "models": 2, "just": 2, "fast": 2}
101 changes: 101 additions & 0 deletions analytics/dagster/data/topstories.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions analytics/dagster/data/topstory_ids.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[40643207, 40643181, 40643167, 40639506, 40642871, 40642476, 40630952, 40631796, 40639606, 40623864, 40639628, 40639450, 40644323, 40635397, 40636292, 40640927, 40632064, 40640833, 40633003, 40643071, 40638764, 40641848, 40636844, 40637102, 40632745, 40636883, 40641704, 40641795, 40633902, 40639032, 40640424, 40641615, 40641932, 40644454, 40631614, 40637374, 40632533, 40637785, 40631466, 40635697, 40631573, 40635789, 40633773, 40642801, 40644605, 40632397, 40615002, 40644111, 40630699, 40643499, 40622209, 40644459, 40627113, 40638386, 40640534, 40643744, 40636854, 40636122, 40637303, 40641388, 40631439, 40631585, 40643259, 40622671, 40619311, 40638445, 40640635, 40627563, 40637089, 40626807, 40623497, 40632773, 40641361, 40622999, 40641091, 40639742, 40631223, 40630656, 40634042, 40643998, 40638741, 40643454, 40634269, 40634774, 40642328, 40642272, 40631558, 40622191, 40634465, 40641116, 40634186, 40640076, 40641443, 40633871, 40638990, 40640499, 40636079, 40625959, 40626969, 40639299]
3 changes: 3 additions & 0 deletions analytics/dagster/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
matplotlib
pandas
requests
1 change: 1 addition & 0 deletions analytics/dagster/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .definitions import defs as defs
Empty file.
98 changes: 98 additions & 0 deletions analytics/dagster/src/assets/hackernews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import base64
import json
import os
from io import BytesIO

import matplotlib.pyplot as plt
import pandas as pd
import requests
from dagster import AssetExecutionContext, MaterializeResult, MetadataValue, asset


@asset(group_name="hackernews", compute_kind="HackerNews API")
def topstory_ids() -> None:
    """Fetch up to 100 top story ids from HackerNews and store them on disk.

    The ids are written as a JSON array to ``data/topstory_ids.json``; the
    ``data`` directory is created when it does not exist yet.

    API Docs: https://github.com/HackerNews/API#new-top-and-best-stories

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if the endpoint does not answer within the timeout.
    """
    topstories_url = "https://hacker-news.firebaseio.com/v0/topstories.json"

    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(topstories_url, timeout=30)
    # Fail loudly instead of persisting an error payload as if it were ids.
    response.raise_for_status()
    top_story_ids = response.json()[:100]

    os.makedirs("data", exist_ok=True)
    with open("data/topstory_ids.json", "w") as f:
        json.dump(top_story_ids, f)


@asset(deps=[topstory_ids], group_name="hackernews", compute_kind="HackerNews API")
def topstories(context: AssetExecutionContext) -> MaterializeResult:
    """Fetch the item behind every stored story id and save them as a CSV.

    Reads the ids from ``data/topstory_ids.json``, requests each item from the
    HackerNews items endpoint and writes the collected items to
    ``data/topstories.csv``. It may take 30 seconds to fetch all 100 items.

    API Docs: https://github.com/HackerNews/API#items

    Args:
        context: Dagster execution context, used for progress logging.

    Returns:
        A ``MaterializeResult`` whose metadata holds the number of records and
        a Markdown preview of the first rows.

    Raises:
        requests.HTTPError: if an item request answers with an error status.
        requests.Timeout: if an item request does not answer within the timeout.
    """
    with open("data/topstory_ids.json", "r") as f:
        story_ids = json.load(f)

    results = []
    # A single session reuses the HTTPS connection across all item requests.
    with requests.Session() as session:
        for item_id in story_ids:
            response = session.get(
                f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json",
                timeout=30,
            )
            response.raise_for_status()
            item = response.json()

            # A JSON `null` body decodes to None, which cannot be turned into
            # a DataFrame row alongside the dict items.
            if item is None:
                context.log.warning(f"Item {item_id} returned no data; skipping.")
                continue
            results.append(item)

            if len(results) % 20 == 0:
                context.log.info(f"Got {len(results)} items so far.")

    df = pd.DataFrame(results)
    df.to_csv("data/topstories.csv")

    return MaterializeResult(
        metadata={
            "num_records": len(df),  # Metadata can be any key-value pair
            "preview": MetadataValue.md(df.head().to_markdown()),
            # The `MetadataValue` class has useful static methods to build Metadata
        }
    )


@asset(deps=[topstories], group_name="hackernews", compute_kind="Plot")
def most_frequent_words(context: AssetExecutionContext) -> MaterializeResult:
    """Get the top 25 most frequent words in the titles of the top 100 HackerNews stories.

    Reads ``data/topstories.csv``, counts the lower-cased, punctuation-stripped
    words of every title (ignoring a small stopword list), writes the 25 most
    frequent ones to ``data/most_frequent_words.json`` and renders them as a
    bar chart.

    Args:
        context: Dagster execution context (not used by the computation).

    Returns:
        A ``MaterializeResult`` whose ``plot`` metadata embeds the bar chart as
        a base64 PNG inside Markdown.
    """
    # A set gives constant-time membership tests in the inner loop.
    stopwords = {"a", "the", "an", "of", "to", "in", "for", "and", "with", "on", "is"}

    stories = pd.read_csv("data/topstories.csv")

    # Loop through the titles and count the frequency of each word.
    # Items without a title are read back as NaN (a float), which has no
    # `.lower()`, so they are dropped first.
    word_counts = {}
    for raw_title in stories["title"].dropna():
        for word in str(raw_title).lower().split():
            cleaned_word = word.strip(".,-!?:;()[]'\"-")
            if cleaned_word and cleaned_word not in stopwords:
                word_counts[cleaned_word] = word_counts.get(cleaned_word, 0) + 1

    # Get the top 25 most frequent words (ties keep first-seen order because
    # the sort is stable).
    top_words = dict(
        sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)[:25]
    )

    # Make a bar chart of the top 25 words
    figure = plt.figure(figsize=(10, 6))
    try:
        plt.bar(list(top_words.keys()), list(top_words.values()))
        plt.xticks(rotation=45, ha="right")
        plt.title("Top 25 Words in Hacker News Titles")
        plt.tight_layout()

        # Convert the image to a saveable format
        buffer = BytesIO()
        plt.savefig(buffer, format="png")
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # long-running process accumulates one figure per materialization.
        plt.close(figure)
    image_data = base64.b64encode(buffer.getvalue())

    # Convert the image to Markdown to preview it within Dagster
    md_content = f"![img](data:image/png;base64,{image_data.decode()})"

    with open("data/most_frequent_words.json", "w") as f:
        json.dump(top_words, f)

    # Attach the Markdown content as metadata to the asset
    return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)})
16 changes: 16 additions & 0 deletions analytics/dagster/src/definitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from dagster import (
Definitions,
ScheduleDefinition,
define_asset_job,
load_assets_from_package_module,
)

from . import assets

# Asset job named "all_assets_job"; no asset selection is passed to it.
_all_assets_job = define_asset_job(name="all_assets_job")

# Cron "0 0 * * *": run the job once a day at 00:00.
daily_refresh_schedule = ScheduleDefinition(
    cron_schedule="0 0 * * *",
    job=_all_assets_job,
)

# Code-location entry point: every asset found in the `assets` package plus
# the daily schedule.
defs = Definitions(
    schedules=[daily_refresh_schedule],
    assets=load_assets_from_package_module(assets),
)
2 changes: 2 additions & 0 deletions analytics/dagster/workspace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Code locations for Dagster to load: the Python module `src`, whose
# __init__.py re-exports `defs` from src/definitions.py.
load_from:
  - python_module: src
27 changes: 27 additions & 0 deletions analytics/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Local development stack: the Dagster webserver plus the PostgreSQL database
# that backs its instance storage.
services:
  dagster:
    build:
      context: dagster/
      dockerfile: Dockerfile
    # Mount the working tree over DAGSTER_HOME so local edits are picked up
    # without rebuilding the image.
    volumes:
      - ./dagster:/opt/dagster/dagster_home
    env_file:
      - .env
    ports:
      - 3000:3000
    # Start the database container before the webserver that stores its
    # state in it.
    depends_on:
      - postgres

  postgres:
    # Pinned to a major version: a floating `latest` tag can move to a newer
    # major whose server refuses to start on the existing data volume.
    image: postgres:16
    ports:
      - 54322:5432
    env_file:
      - .env
    environment:
      - POSTGRES_PASSWORD=${DAGSTER_PG_PASSWORD}
      - POSTGRES_USER=${DAGSTER_PG_USERNAME}
      - POSTGRES_DB=${DAGSTER_PG_DB}
    volumes:
      - dagster-postgres:/var/lib/postgresql/data

volumes:
  dagster-postgres:
    driver: local

0 comments on commit 4b513b4

Please sign in to comment.