diff --git a/.github/workflows/github-actions-data-stack.yml b/.github/workflows/github-actions-data-stack.yml new file mode 100644 index 000000000..78e97818e --- /dev/null +++ b/.github/workflows/github-actions-data-stack.yml @@ -0,0 +1,14 @@ +name: Data Stack CI + +on: + push: + paths: + - analytics/** + +jobs: + deploy-dagster: + uses: ./.github/workflows/deploy.yml + with: + app: dagster-production + branch: main + secrets: inherit diff --git a/.gitignore b/.gitignore index 1db1155bd..26436ffc3 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,8 @@ yarn-error.log* /talisman_report .clever.json + +# Data Stack +.pyc +__pycache__ +analytics/dagster/storage/ diff --git a/analytics/dagster/.nux/nux.yaml b/analytics/dagster/.nux/nux.yaml new file mode 100644 index 000000000..1ba1db1b2 --- /dev/null +++ b/analytics/dagster/.nux/nux.yaml @@ -0,0 +1 @@ +seen: 1 diff --git a/analytics/dagster/Dockerfile b/analytics/dagster/Dockerfile new file mode 100644 index 000000000..f68ce7418 --- /dev/null +++ b/analytics/dagster/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.10-slim + +RUN pip install dagster-webserver dagster-postgres dagster-aws + +# Install dependencies +COPY requirements.txt . 
+RUN pip install -r requirements.txt + +ENV DAGSTER_HOME=/opt/dagster/dagster_home/ + +RUN mkdir -p $DAGSTER_HOME + +COPY dagster.yaml workspace.yaml $DAGSTER_HOME + +COPY src/ $DAGSTER_HOME + +WORKDIR $DAGSTER_HOME + +EXPOSE 3000 + +ENTRYPOINT ["dagster-webserver", "-h", "0.0.0.0", "-p", "3000"] diff --git a/analytics/dagster/dagster.yaml b/analytics/dagster/dagster.yaml new file mode 100644 index 000000000..fedb16205 --- /dev/null +++ b/analytics/dagster/dagster.yaml @@ -0,0 +1,15 @@ +storage: + postgres: + postgres_db: + username: + env: DAGSTER_PG_USERNAME + password: + env: DAGSTER_PG_PASSWORD + hostname: + env: DAGSTER_PG_HOST + db_name: + env: DAGSTER_PG_DB + port: 5432 + +telemetry: + enabled: false \ No newline at end of file diff --git a/analytics/dagster/data/most_frequent_words.json b/analytics/dagster/data/most_frequent_words.json new file mode 100644 index 000000000..1fae8bcd2 --- /dev/null +++ b/analytics/dagster/data/most_frequent_words.json @@ -0,0 +1 @@ +{"\u2013": 9, "new": 6, "hn": 5, "why": 5, "ai": 4, "show": 4, "from": 4, "macos": 3, "language": 3, "server": 3, "at": 3, "video": 3, "may": 3, "be": 3, "using": 3, "pdf": 3, "vision": 2, "sequoia": 2, "virtual": 2, "arm": 2, "exploring": 2, "apple's": 2, "models": 2, "just": 2, "fast": 2} \ No newline at end of file diff --git a/analytics/dagster/data/topstories.csv b/analytics/dagster/data/topstories.csv new file mode 100644 index 000000000..5ed79db95 --- /dev/null +++ b/analytics/dagster/data/topstories.csv @@ -0,0 +1,101 @@ +,by,descendants,id,kids,score,time,title,type,url,text +0,RafelMri,53.0,40643207,"[40644828, 40644869, 40643824, 40644023, 40644285, 40644054, 40644785, 40643908, 40644404, 40644751, 40644237, 40644218, 40643964, 40643821, 40644052, 40644577, 40643808, 40644093, 40644132, 40644401, 40643772, 40643855, 40644181, 40644433, 40644157]",110,1718087804,POV-Ray – The Persistence of Vision Raytracer,story,http://www.povray.org/, +1,throwaway-blaze,10.0,40643181,"[40644333, 
40643182, 40643552, 40644698, 40643829]",85,1718087509,macOS Sequoia to Allow iCloud Logins in Virtual Machines on ARM Macs,story,https://developer.apple.com/documentation/virtualization/using_icloud_with_macos_virtual_machines?language=objc, +2,crowdhailer,34.0,40643167,"[40644331, 40644427, 40644849, 40644435, 40644350, 40644494, 40643997, 40644572]",87,1718087389,"Exploring Gleam, a type-safe language on the BEAM",story,https://christopher.engineering/en/blog/gleam-overview/, +3,2bit,359.0,40639506,"[40641415, 40640114, 40640837, 40640364, 40640302, 40644140, 40644666, 40642536, 40642453, 40640241, 40640131, 40643955, 40640261, 40639998, 40640011, 40643632, 40642527, 40642607, 40640155, 40642413, 40640123, 40642070, 40641060, 40642211, 40640868, 40640907, 40640008, 40639882, 40639995, 40640421, 40639955, 40643131, 40642757, 40641541]",708,1718055731,Apple's On-Device and Server Foundation Models,story,https://machinelearning.apple.com/research/introducing-apple-foundation-models, +4,ulrischa,6.0,40642871,"[40644867, 40644862, 40644521, 40643882, 40644105, 40643945]",67,1718084967,"NanoGPT: The simplest, fastest repository for training medium-sized GPTs",story,https://github.com/karpathy/nanoGPT, +5,calpaterson,21.0,40642476,"[40644318, 40644069, 40643994, 40644710, 40644185]",65,1718081515,DuckDB Isn't Just Fast,story,https://csvbase.com/blog/6, +6,skilled,5.0,40630952,"[40644871, 40644868, 40644860, 40630960]",24,1718002727,OpenWorm – creating a virtual organism in a computer,story,https://openworm.org/, +7,hggh,8.0,40631796,"[40644791, 40643619, 40644764, 40643440, 40643660, 40644673]",66,1718012865,Free Quality SoundFonts (Sf2),story,https://sites.google.com/site/soundfonts4u/, +8,serhack_,235.0,40639606,"[40642804, 40644008, 40640713, 40642152, 40642435, 40640489, 40641439, 40640535, 40640843, 40644592, 40644593, 40640988, 40643483, 40644027, 40641137, 40643295, 40641113, 40642905, 40640001, 40641043, 40641799, 40643533, 40642002, 40644364, 40640959, 
40641965, 40640993, 40641564, 40642434, 40642455, 40640096, 40641546, 40640943, 40641359, 40640717, 40640351, 40641790, 40641232, 40642907, 40640823, 40643485, 40640528, 40641907, 40640704, 40641197, 40640240]",463,1718056387,Private Cloud Compute: A new frontier for AI privacy in the cloud,story,https://security.apple.com/blog/private-cloud-compute/, +9,craydandy,93.0,40623864,"[40638690, 40639279, 40639063, 40644599, 40638665, 40639961, 40643096, 40642193, 40642132, 40639073, 40638207, 40639521, 40643858, 40640207, 40641936, 40637550, 40640771, 40642358, 40642552, 40640141, 40643049, 40642367, 40643704, 40641620, 40640304, 40639457, 40639372, 40638602, 40639684, 40644254, 40639496, 40640169, 40639145, 40639572, 40640022, 40637833, 40638418, 40641759]",492,1717934539,I built an ROV to solve missing person cases,story,https://suanto.com/2024/06/06/the-time-I-built-an-ROV-01/, +10,ingve,34.0,40639628,"[40643006, 40642448, 40642573, 40644556, 40643662, 40641861, 40641849, 40644696, 40641783, 40642048, 40643464, 40643407, 40641940]",128,1718056513,"Engage your audience: get to the point, use story structure, force specificity",story,https://iandanielstewart.com/2024/06/09/engage-your-audience-by-getting-to-the-point-using-story-structure-and-forcing-specificity/, +11,Sephr,34.0,40639450,"[40643121, 40641976, 40643737, 40642087, 40643963, 40643579, 40642391, 40639455]",84,1718055438,Big Tech's role in enabling link fraud – take 2,story,https://eligrey.com/blog/link-fraud/,"I posted an earlier draft of this article to Hacker News 4 months ago which was well received [1], but it didn't garner much discussion around the core issue.

Dialog around link fraud complacency was likely sidetracked due to a lack of real-world examples to better illustrate the technical problem at hand. To better illustrate the issue, I've added an examples section citing over 20 cases of link fraud on Google Search, Bing, and X.

I hope that this thread can spark a meaningful dialog around law and security as it pertains to Big Tech's role in enabling link fraud. Society can overcome this issue through concerted efforts to raise awareness and enforce existing legislation.

1. https://news.ycombinator.com/item?id=39003929" +12,pantalaimon,1.0,40644323,[40644711],7,1718099031,Exploring TrustZone-M on the NRF9160,story,https://lenas-fieldnotes.de/minimal-tz/, +13,igpay,88.0,40635397,"[40636495, 40644517, 40637466, 40640101, 40639853, 40636066, 40635895, 40636504, 40637214, 40640921, 40636049, 40636394, 40643155, 40641686, 40639473, 40641182, 40640116, 40638177, 40640345, 40639583, 40635916, 40636849, 40636238, 40641700, 40639382, 40637787, 40636255, 40639414, 40638047, 40638078, 40639491, 40639676, 40635781, 40639083, 40636337, 40641581, 40639255, 40642734, 40636567, 40636569, 40636082]",258,1718037605,Show HN: Probabilistic Tic-Tac-Toe,story,https://www.csun.io/2024/06/08/probabilistic-tic-tac-toe.html, +14,drx,416.0,40636292,"[40638134, 40637883, 40637666, 40644842, 40636823, 40636860, 40636579, 40636905, 40644320, 40637926, 40636666, 40637135, 40637867, 40637479, 40637063, 40643303, 40639957, 40642783, 40637855, 40636781, 40644002, 40636726, 40636994, 40636891, 40637965, 40636519, 40636851, 40636869, 40642244, 40639549, 40638628, 40639176, 40636412, 40638501, 40636901, 40637332, 40636895, 40637127, 40638405, 40636416, 40637628, 40638247, 40637029, 40639615, 40637764, 40643982, 40639966, 40636868, 40638858, 40636762, 40637798, 40638860, 40636769, 40638243, 40637682, 40639310, 40637752, 40637531, 40636426, 40636937, 40636429, 40636643, 40642227, 40636978, 40637198]",186,1718042650,Apple unveils 'Passwords' manager app at WWDC 2024,story,https://www.zdnet.com/article/forget-lastpass-apple-unveils-passwords-manager-app-at-wwdc-2024/, +15,MaxLeiter,103.0,40640927,"[40641762, 40644830, 40642780, 40641971, 40642635, 40641815, 40644278, 40644783, 40641869, 40643724, 40643786, 40641775, 40643740, 40643719, 40642084, 40644675, 40641874, 40642222, 40642991, 40641938, 40641993, 40644735, 40644112, 40642839, 40642040, 40642925, 40641744, 40641391, 40642425, 40642158, 40642085, 40642077]",151,1718065705,Ship Something Every 
Day,story,https://maxleiter.com/blog/ship-every-day, +16,rbanffy,0.0,40632064,,14,1718015793,A Short History of CP/M-86 – By Bradford Morgan White,story,https://www.abortretry.fail/p/a-short-history-of-cpm-86, +17,wglb,20.0,40640833,"[40641073, 40642216, 40643147, 40642953]",79,1718064900,Siberia's 'mammoth graveyard' reveals 800-year human interactions with mammoths,story,https://phys.org/news/2024-06-siberia-mammoth-graveyard-reveals-year.html, +18,xorvoid,18.0,40633003,"[40641284, 40644160, 40644576, 40644752, 40642137, 40643267, 40643153, 40643550, 40642234]",104,1718023668,Forsp: A Forth+Lisp Hybrid Lambda Calculus Language,story,https://xorvoid.com/forsp.html, +19,zdw,53.0,40643071,"[40644062, 40644110, 40643846, 40644102, 40643826, 40643962, 40643954, 40644447, 40644170, 40643728, 40643953, 40644353, 40643918, 40644715]",74,1718086696,Spotify: Droppin' Some Fake Beats,story,https://lcamtuf.substack.com/p/spotify-droppin-some-fake-beats, +20,gradus_ad,20.0,40638764,"[40640605, 40643008, 40640411, 40642101, 40641220, 40642614]",122,1718052124,The new math of how large-scale order emerges,story,https://www.quantamagazine.org/the-new-math-of-how-large-scale-order-emerges-20240610/, +21,nextworddev,11.0,40641848,"[40644853, 40644786, 40644591, 40644679, 40644470]",26,1718075359,Apple's AI Strategy in a Nutshell,story,https://nextword.substack.com/p/apples-ai-strategy-in-a-nutshell, +22,terramex,1069.0,40636844,"[40637094, 40639568, 40637108, 40637125, 40637043, 40637067, 40637267, 40636560, 40638786, 40638242, 40637013, 40638509, 40637903, 40636706, 40636682, 40644337, 40636970, 40636775, 40636631, 40637554, 40644013, 40639132, 40637159, 40636918, 40637450, 40637150, 40636916, 40639332, 40644161, 40637088, 40636974, 40637059, 40636990, 40637041, 40636626, 40636928, 40639726, 40639010, 40637286, 40637103, 40637481, 40637201, 40636744, 40643518, 40638257, 40636859, 40637425, 40637979, 40639129, 40636603, 40636942, 40637028, 40637053, 40636734, 40640389, 40640126, 
40639678, 40637096, 40638154, 40641824, 40636923, 40640274, 40637192, 40639419, 40639375, 40640775, 40638585, 40636757, 40636904, 40637623, 40636845, 40637580, 40642597, 40638723, 40637886, 40638557, 40637142, 40637075, 40636709, 40639569, 40636702, 40639263, 40638722, 40636948, 40636998, 40638593, 40637534, 40644244, 40637342, 40643065, 40636853, 40639050, 40636608, 40640785, 40636919, 40642433, 40640628, 40637265, 40638780, 40636587, 40637901, 40637206, 40638885, 40639618, 40636557, 40639731, 40636939, 40636960, 40640067, 40638727, 40639264, 40637071, 40636712, 40638339, 40636834, 40636722, 40636906, 40638498, 40637120, 40637707, 40639625, 40639195, 40638724, 40639702, 40640735, 40637273, 40636785, 40642182, 40641292, 40637777, 40637533, 40640760, 40642952, 40640222, 40638515, 40637238, 40636780, 40641024, 40636968, 40641169, 40638575, 40640902, 40637110, 40640215, 40639910, 40639537, 40638631, 40637780, 40637568, 40637500, 40637129, 40637060, 40636982, 40636953, 40636627, 40636716, 40637182, 40636796, 40636585, 40636864, 40642821, 40637005, 40637571, 40641904, 40637097, 40641908, 40637515, 40637006, 40636606, 40636979, 40636629, 40638845, 40636700, 40639304]",952,1718045327,"Apple Intelligence for iPhone, iPad, and Mac",story,https://www.apple.com/newsroom/2024/06/introducing-apple-intelligence-for-iphone-ipad-and-mac/, +23,throwup238,251.0,40637102,"[40640212, 40644805, 40644731, 40639609, 40639927, 40640380, 40639280, 40640673, 40642275, 40639507, 40639777, 40639901, 40644564, 40640209, 40640005, 40641802, 40642100, 40642270, 40640423, 40641657]",307,1718046026,Deterioration of local community a major driver of loss of play-based childhood,story,https://www.afterbabel.com/p/community-based-childhood, +24,geek_at,100.0,40632745,"[40636265, 40638625, 40636550, 40637021, 40641386, 40636419, 40636184, 40638348, 40636820, 40644280, 40638581, 40638607, 40640660, 40639771, 40636342, 40636115, 40636861, 40637769, 40636595, 40636073, 40641739, 40639951, 40635975, 
40636647, 40638457, 40641888, 40637465, 40637980, 40636180, 40636609, 40636531, 40638633, 40636446, 40636470, 40643877]",178,1718021696,Sending emails to my three-year-old,story,https://blog.haschek.at/2024/leaving-a-digital-legacy.html, +25,ca98am79,70.0,40636883,"[40638720, 40639914, 40641540, 40640703, 40638761, 40643274, 40644109, 40643868, 40639197, 40639831, 40640305, 40638781, 40639973, 40639832, 40639585, 40639217, 40640283, 40637922, 40643843, 40641114, 40638537, 40638680]",174,1718045453,The rarest move in chess [video],story,https://www.youtube.com/watch?v=iDnW0WiCqNc, +26,getwiththeprog,11.0,40641704,"[40643067, 40643746, 40643396, 40642840, 40643352, 40642769]",86,1718073516,Biodiversity enhances immune regulation among daycare children,story,https://www.science.org/doi/10.1126/sciadv.aba2578, +27,oidar,44.0,40641795,"[40642251, 40642584, 40642460, 40644545, 40644148, 40643221, 40642223, 40643180, 40642598]",71,1718074517,Blackmagic Cine Immersive Capture for Vision Pro 8160x7200 Resolution per Eye,story,https://www.newsshooter.com/2024/06/10/blackmagic-ursa-cine-immersive-capture-content-for-apple-vision-pro-with-8160-x-7200-resolution-per-eye/, +28,belter,27.0,40633902,"[40634985, 40636336, 40637243, 40638077, 40639703, 40634789, 40637361, 40638375, 40634858, 40640877]",207,1718028864,Mexican Computers: A Brief Technical and Historical Overview,story,https://arxiv.org/abs/2406.04912, +29,skeptrune,,40639032,,1,1718053243,Trieve (YC W24) Is Hiring a DevRel Software Engineer,job,https://www.ycombinator.com/companies/trieve/jobs/2jeeXLs-developer-relations-software-engineer, +30,Anon84,6.0,40640424,"[40643665, 40642810, 40643600, 40643170]",56,1718061486,The Geometry of Categorical and Hierarchical Concepts in Large Language Models,story,https://arxiv.org/abs/2406.01506, +31,indigodaddy,4.0,40641615,"[40642778, 40643014]",39,1718072630,Genes protective during Black Death may now be increasing autoimmune disorders 
(2022),story,https://www.health.harvard.edu/blog/genes-protective-during-the-black-death-may-now-be-increasing-autoimmune-disorders-202212012859, +32,nequo,2.0,40641932,"[40643995, 40643850]",15,1718076150,The Common Lisp Cookbook (2007),story,https://cl-cookbook.sourceforge.net/index.html, +33,dp-hackernews,0.0,40644454,,6,1718100453,Humans May Be Able to Grow New Teeth Within Just 6 Years,story,https://www.popularmechanics.com/science/health/a60952102/tooth-regrowth-human-trials-japan/, +34,CTOSian,2.0,40631614,"[40643076, 40642574]",30,1718010848,An Arduino interface for 8088 CPUs,story,https://github.com/dbalsom/arduino_8088, +35,drdee,10.0,40637374,"[40644444, 40639874, 40641312, 40643251, 40640756, 40643608, 40640375, 40641889]",48,1718046819,Pyrophone,story,https://en.wikipedia.org/wiki/Pyrophone, +36,unlog,48.0,40632533,"[40638029, 40637652, 40632569, 40643806, 40644001, 40638402, 40639800, 40638597, 40635752, 40636252, 40634593, 40639035, 40638598]",170,1718020020,"Show HN: Crawl a modern website to a zip, serve the website from the zip",story,https://github.com/potahtml/mpa-archive, +37,jandrewrogers,36.0,40637785,"[40640782, 40642475, 40644328, 40643595, 40644221, 40641380, 40642064, 40640980, 40640938, 40640942, 40641142, 40642114, 40640066, 40641364]",98,1718048369,Possible exposure of Earth to dense interstellar medium 2-3M years ago,story,https://www.nature.com/articles/s41550-024-02279-8, +38,082349872349872,0.0,40631466,,7,1718009035,The Last Mathematical Testament of Galois,story,https://www.ias.ac.in/article/fulltext/reso/004/10/0093-0100, +39,classichasclass,19.0,40635697,"[40636983, 40636547, 40636962, 40636524]",69,1718039392,pico9918: A replacement TMS9918A/TMS9929A VDP using a Raspberry Pi Pico,story,https://github.com/visrealm/pico9918, +40,YeGoblynQueenne,5.0,40631573,"[40643301, 40643469]",32,1718010281,Wooden bowling arm that bested Australian cricketer in 1909 
rebuilt,story,https://www.theguardian.com/technology/article/2024/jun/10/wooden-bowling-arm-john-venn-machine-rebuilt-cambridge-australia-1909-cricket-team, +41,anarbadalov,63.0,40635789,"[40642151, 40643717, 40636578, 40639561, 40637038, 40636619, 40639597]",56,1718039923,How David Bohm and Hugh Everett changed quantum theory,story,https://daily.jstor.org/how-two-rebel-physicists-changed-quantum-theory/, +42,alishobeiri,34.0,40633773,"[40641405, 40643693, 40635533, 40635793, 40639259, 40642660, 40635673, 40635861, 40636097, 40635754, 40635527, 40635574]",128,1718027962,Show HN: Thread – AI-powered Jupyter Notebook built using React,story,https://github.com/squaredtechnologies/thread,"Hey HN, we're building Thread (https://thread.dev/) an open-source Jupyter Notebook that has a bunch of AI features built in. The easiest way to think of Thread is if the chat interface of OpenAI code interpreter was fused into a Jupyter Notebook development environment where you could still edit code or re-run cells. To check it out, you can see a video demo here: https://www.youtube.com/watch?v=Jq1_eoO6w-c

We initially got the idea when building Vizly (https://vizly.fyi/) a tool that lets non-technical users ask questions from their data. While Vizly is powerful at performing data transformations, as engineers, we often felt that natural language didn't give us enough freedom to edit the code that was generated or to explore the data further for ourselves. That is what gave us the inspiration to start Thread.

We made Thread a pip package (`pip install thread-dev`) because we wanted to make Thread as easily accessible as possible. While there are a lot of notebooks that improve on the notebook development experience, they are often cloud hosted tools that are hard to access as an individual contributor unless your company has signed an enterprise agreement.

With Thread, we are hoping to bring the power of LLMs to the local notebook development environment while blending the editing experience that you can get in a cloud hosted notebook. We have many ideas on the roadmap but instead of building in a vacuum (which we have made the mistake of before) our hope was to get some initial feedback to see if others are as interested in a tool like this as we are.

Would love to hear your feedback and see what you think!" +43,skilled,88.0,40642801,"[40643264, 40643412, 40643328, 40643716, 40643387, 40644439, 40643186, 40643224, 40643354, 40643139, 40643567]",102,1718084413,British duo arrested for SMS phishing via homemade cell tower,story,https://www.cityoflondon.police.uk/news/city-of-london/news/2024/june/two-people-arrested-in-connection-with-investigation-into-homemade-mobile-antenna-used-to-send-thousands-of-smishing-text-messages-to-the-public/, +44,ColinWright,0.0,40644605,,3,1718101983,When Water Flows Uphill [video],story,https://www.youtube.com/watch?v=zzKgnNGqxMw, +45,MaysonL,5.0,40632397,"[40641990, 40638586, 40641983]",43,1718018816,Water,story,https://johncarlosbaez.wordpress.com/2013/11/29/water/, +46,diodorus,0.0,40615002,,5,1717816539,Policing the Chôra: Law Enforcement in Ptolemaic Egypt (2005) [pdf],story,https://www.u.arizona.edu/~jbausch1/cv/BauschatzFullDissertation.pdf, +47,kieto,1.0,40644111,[40644771],9,1718097150,Raspberry Pi IPO,story,https://www.raspberrypi.com/news/raspberry-pi-ipo/, +48,ksec,98.0,40630699,"[40633994, 40630785, 40632875, 40631951, 40634697, 40632445, 40631981, 40643030, 40631980, 40641488, 40633014, 40631681, 40636971, 40637023, 40634549]",224,1717999811,The Magical Mystery Merge Or Why we run FreeBSD-current at Netflix (2023) [pdf],story,https://people.freebsd.org/~gallatin/talks/OpenFest2023.pdf, +49,skilled,9.0,40643499,"[40643971, 40644029, 40643981, 40643851]",12,1718090324,Meta says European data is essential for culturally relevant AI,story,https://stackdiary.com/meta-says-european-data-is-essential-for-culturally-relevant-ai/, +50,magnio,46.0,40622209,"[40632366, 40634152, 40632555, 40632872, 40635193, 40641063, 40633995, 40632986, 40632025, 40635590, 40632960, 40637426, 40634035, 40633658, 40632656]",250,1717910091,"Scratchapixel 4.0, Learn Computer Graphics Programming",story,https://www.scratchapixel.com/index.html, 
+51,TangerineDream,1.0,40644459,[40644778],5,1718100494,Why curl closes PRs on GitHub,story,https://daniel.haxx.se/blog/2024/06/11/why-curl-closes-prs-on-github/, +52,hindsightbias,9.0,40627113,"[40641173, 40643233, 40641097, 40640806, 40641090]",35,1717963179,Researchers demonstrate the first chip-based 3D printer,story,https://news.mit.edu/2024/researchers-demonstrate-first-chip-based-3d-printer-0606, +53,stareatgoats,75.0,40638386,"[40639352, 40641079, 40639256, 40639531, 40639478, 40641377, 40640886]",175,1718050593,Controversial pesticide research all but vanished from a major conference,story,https://usrtk.org/bees-neonics/entomological-society-america-corporate-partners/, +54,diggan,1.0,40640534,[40644845],12,1718062255,Jank's new persistent string is fast,story,https://jank-lang.org/blog/2023-12-30-fast-string/, +55,isaacfrond,7.0,40643744,"[40644657, 40644684, 40644391, 40644003, 40643970]",11,1718093025,The word 'bot' is increasingly being used as an insult on social media,story,https://www.newscientist.com/article/2434742-the-word-bot-is-increasingly-being-used-as-an-insult-on-social-media/, +56,davidbarker,288.0,40636854,"[40637054, 40638579, 40638081, 40640958, 40637251, 40637530, 40637878, 40640378, 40637540, 40637521, 40637476, 40639082, 40637936, 40640636, 40638397, 40643694, 40643379, 40641428, 40638009, 40638566, 40637340, 40639766, 40638132, 40642538, 40640848, 40638203, 40637735, 40640187, 40638472, 40639971, 40639891, 40637835, 40637607, 40637495, 40643845, 40638404, 40637433, 40639350]",293,1718045347,macOS Sequoia Preview,story,https://www.apple.com/macos/macos-sequoia-preview/, +57,jordigh,0.0,40636122,,29,1718041744,Nvidia-patch: removes restriction simultaneous video encoding sessions,story,https://github.com/keylase/nvidia-patch, +58,thunderbong,4.0,40637303,"[40643039, 40641822, 40642389, 40639992]",31,1718046610,SQLSync: A collaborative offline-first wrapper around SQLite,story,https://github.com/orbitinghail/sqlsync, 
+59,ibobev,0.0,40641388,,10,1718070220,Python and OpenGL for Scientific Visualization,story,https://www.labri.fr/perso/nrougier/python-opengl/, +60,mkurz,138.0,40631439,"[40631903, 40631596, 40631652, 40631627, 40631605, 40631927, 40631891, 40631645, 40631762, 40631599, 40632911, 40633018, 40635744, 40631622, 40632138, 40634717, 40631913, 40631582, 40631677, 40631578, 40631587, 40631588, 40631595]",205,1718008722,WebKit fix: Quirk news.ycombinator to skip TextAutoSizing,story,https://github.com/WebKit/WebKit/commit/84ae355619354ee1bfa7daaa1fc95565a6726be3, +61,lnyan,9.0,40631585,"[40641138, 40641102, 40642078, 40641614, 40641146, 40641144]",14,1718010471,Filmed.js: film strip image effect,story,https://www.netzgesta.de/filmed/, +62,TamTech,2.0,40643259,[40643951],10,1718088187,The 'Dead Internet Theory',story,https://theconversation.com/the-dead-internet-theory-makes-eerie-claims-about-an-ai-run-web-the-truth-is-more-sinister-229609, +63,skilled,37.0,40622671,"[40633660, 40633996, 40635496, 40634384, 40633068, 40632857, 40634262, 40633937, 40634902, 40633188, 40635405, 40642828, 40635360, 40622680, 40633860, 40634852]",108,1717918301,PiDP-10 – a modern replica of the PDP-10,story,https://obsolescence.dev/pidp10.html, +64,goles,33.0,40619311,"[40627217, 40628315, 40627354, 40635222, 40627219, 40637678, 40636949]",85,1717870329,Western Pennsylvania dirt is used in the infields of most MLB stadiums (2017),story,https://www.post-gazette.com/sports/pirates/2017/08/31/baseball-infield-dirt-mix-prices-duraedge-pnc-park-wrigley-field-duraedge-slippery-rock-soil-pennsylvania/stories/201708310111, +65,jstanley,1.0,40638445,[40643707],30,1718050769,Printable Popup Horizontal Sundials,story,https://www.blocklayer.com/sundial-popeng, +66,letmutex,4.0,40640635,"[40643010, 40643105]",10,1718063038,Effective substring in Rust,story,https://letmutex.com/article/effective-substring-in-rust, +67,matt_d,21.0,40627563,"[40635746, 40638371, 40639932, 40638648, 40635770, 
40636509]",54,1717966635,Deep Dive into Ownership in Mojo,story,https://www.modular.com/blog/deep-dive-into-ownership-in-mojo, +68,adomasm3,11.0,40637089,"[40640838, 40641094]",23,1718045992,"Phthalates, Toxic Plastic Additives, Are Everywhere: What's the Acceptable Limit",story,https://molecularspec.substack.com/p/phthalates-in-food-assessing-intake, +69,_Microft,62.0,40626807,"[40634063, 40628369, 40630764, 40628540, 40627486, 40627796, 40627270, 40631169, 40630288, 40630768, 40627106, 40634963, 40628760, 40631912, 40627828, 40634976, 40628881, 40628468, 40631804, 40634076, 40627702, 40631037, 40632909, 40631410]",597,1717960988,Designing a Lego orrery,story,https://marian42.de/article/orrery/, +70,skilled,21.0,40623497,"[40632764, 40633231, 40633583, 40642456, 40633561, 40634618, 40633546]",67,1717929627,Fingerprinting VPNs with Custom Router Firmware [pdf],story,https://censorbib.nymity.ch/pdf/Almutairi2024a.pdf, +71,padolsey,146.0,40632773,"[40634342, 40634094, 40634238, 40644255, 40635146, 40643393, 40633252, 40633460, 40633182, 40633280, 40634328, 40633218, 40634040, 40633874, 40633789, 40633236, 40633138, 40633222, 40634087, 40634939, 40633197, 40633728, 40633348, 40633545, 40633544, 40633972, 40633779, 40633200, 40634515, 40635088, 40634260, 40639669, 40633806, 40633784, 40633413]",429,1718021986,Show HN: Markdown HN profiles at {user}.at.hn,story,https://at.hn,"Very opportunistic toy project as I saw the domain was up for grabs: 'at.hn' is a little site where people can have their own subdomains for whatever their HN username is (opt-in only by adding a slug to your bio). It doesn't really do much. Just shows your HN bio rendered as markdown plus meta stuff. I'm thinking of adding an aggregated user listing on the homepage so people can explore profiles. There's a bunch of interesting people on HN but discoverability is a bit longwinded. I'm wondering what other features people want. Otherwise shall likely leave it as-is. 
I remember hnbadges was a thing for a while, but can't remember what happened to it. Did people like that? Anyway, at.hn's on github if people want to contribute. - https://github.com/padolsey/at.hn" +72,rudolfwinestock,174.0,40641361,"[40643491, 40643036, 40642066, 40643232, 40642104, 40642060, 40643631, 40644557, 40642524, 40643549, 40642402, 40643143, 40643904, 40642701, 40643462, 40643371, 40643732, 40642559, 40642827, 40643410, 40642674, 40643316, 40643894, 40643241, 40642523, 40641832, 40643047, 40643152, 40643991]",243,1718069975,Noam Chomsky 'no longer able to talk' after 'medical event',story,https://www.independent.co.uk/arts-entertainment/books/news/noam-chomsky-health-update-tributes-b2559831.html, +73,zeristor,19.0,40622999,"[40638576, 40641030, 40638530, 40640784, 40640859, 40640694, 40638153, 40623013, 40639223]",52,1717923633,From Steampunk to Solarpunk (2008),story,http://republicofthebees.blogspot.com/2008/04/from-steampunk-to-solarpunk.html, +74,tanelpoder,0.0,40641091,,9,1718067484,eBPF BCC to libbpf conversion guide,story,https://nakryiko.com/posts/bcc-to-libbpf-howto-guide/, +75,klaussilveira,0.0,40639742,,9,1718057183,Spilo: High Availability PostgreSQL cluster using Docker,story,https://github.com/zalando/spilo, +76,skilled,128.0,40631223,"[40643112, 40641945, 40642732, 40640794, 40641755, 40641262, 40643429, 40642962, 40643738, 40640968, 40640799, 40641086, 40641034, 40641946, 40640693]",110,1718006015,Anti-Cheat Expert: all your pixels are belong to us,story,https://invlpg.dev/post/ace_screenshots/, +77,ingve,5.0,40630656,"[40639631, 40640054, 40639456, 40640422, 40639552]",37,1717999209,The Engine of the Future,story,https://c0de517e.com/014_future_engines.htm, +78,puzzledpenguin,72.0,40634042,"[40634871, 40634563, 40643941, 40635315, 40634557, 40634839, 40638685, 40635354, 40635823, 40635143, 40640779, 40634915, 40640677, 40638647, 40634929, 40640862, 40638003, 40634587, 40635148, 40639701, 40635962, 40635776, 40635687, 40636986, 
40634383, 40635710, 40635080, 40635851, 40634599, 40635713]",275,1718029827,23words.com,story,https://23words.com, +79,ndsipa_pomu,0.0,40643998,,3,1718095913,AI trained on photos from kids' entire childhood without their consent,story,https://arstechnica.com/tech-policy/2024/06/ai-trained-on-photos-from-kids-entire-childhood-without-their-consent/, +80,belter,7.0,40638741,"[40642230, 40640734, 40639673, 40639092, 40639759]",36,1718052024,The British Newspaper Archive,story,https://www.britishnewspaperarchive.co.uk/, +81,cyberlimerence,0.0,40643454,,8,1718089924,Inside Mexico’s anti-avocado militias,story,https://www.theguardian.com/news/article/2024/jun/11/inside-mexico-anti-avocado-militias, +82,dlazaro,5.0,40634269,[40640336],24,1718031177,Creating Perfect Font Fallbacks in CSS,story,https://www.aleksandrhovhannisyan.com/blog/perfect-font-fallbacks/, +83,rachofsunshine,254.0,40634774,"[40637454, 40635310, 40635569, 40635650, 40635549, 40636225, 40644432, 40635344, 40642762, 40640065, 40635435, 40636522, 40635656, 40641021, 40640983, 40635722, 40636564, 40635510, 40638937, 40635632, 40642134, 40637939, 40637864, 40635447, 40636096, 40638746, 40636646, 40635786, 40641296, 40640356, 40635554, 40638210, 40637438, 40639430, 40638675, 40638089, 40638054, 40637575, 40638520, 40639990, 40636098, 40635307, 40641261, 40636772, 40639433]",203,1718034153,Why Triplebyte Failed,story,https://www.otherbranch.com/blog/why-triplebyte-failed, +84,zshrc,115.0,40642328,"[40642727, 40642617, 40642936, 40643191, 40642858, 40642723]",124,1718079989,macOS 15.0 supports Nested Virtualization on M3 chips,story,https://developer.apple.com/documentation/virtualization/vzgenericplatformconfiguration/4360553-isnestedvirtualizationsupported, +85,thesuperbigfrog,19.0,40642272,"[40642731, 40643035, 40643531, 40643288, 40643562, 40642610, 40642686]",22,1718079401,Google is ready to fill free streaming TV channels with 
ads,story,https://www.theverge.com/2024/6/10/24175676/google-fast-ads-streaming-tv-network, +86,rbanffy,143.0,40631558,"[40631874, 40633095, 40632603, 40632813, 40633490, 40633149, 40632950, 40636187, 40632597, 40633134, 40632146, 40637931, 40631902, 40633371, 40634071, 40632151, 40637921, 40634841, 40632695, 40632431, 40635018, 40639828, 40632841, 40631840, 40631810, 40632299]",174,1718010142,The Mythical Non-Roboticist: Wouldn't it be great if everyone could do robotics?,story,https://spectrum.ieee.org/the-mythical-non-roboticist, +87,todsacerdoti,73.0,40622191,"[40633266, 40631494, 40634567, 40630745, 40634705, 40643687, 40638550, 40635879, 40632928, 40634407, 40631059, 40633670, 40636310, 40633589, 40631672, 40631454]",151,1717909634,Dmv.org,story,https://computer.rip/2024-06-08-dmv.org.html, +88,lnyan,150.0,40634465,"[40635258, 40634735, 40634646, 40634754, 40635238, 40636729, 40635124, 40638931, 40634501, 40637834, 40635259, 40635343, 40636178, 40635351, 40634665, 40635472, 40636538, 40636142, 40637868, 40635896]",233,1718032162,"Gainax, known for 'Evangelion' anime production, goes bankrupt",story,https://www.japantimes.co.jp/business/2024/06/08/evangelion-anime-production-company-bankrupt/, +89,carllippert,50.0,40641116,"[40641544, 40642831, 40641648, 40641561, 40642113, 40641406, 40643225, 40642763, 40642486, 40641654]",59,1718067658,Back To Atoms: Why we can stop building SaaS and build the future instead.,story,https://carllippert.com/back-to-atoms/, +90,segasaturn,5.0,40634186,[40639318],21,1718030644,"0patch – Security Patches for Windows 7, 8, 10, Server 2008, Server 2012",story,https://0patch.com/, +91,hackernj,8.0,40640076,"[40640905, 40641494]",57,1718059225,Wild elephants may have names that other elephants use to call them,story,https://www.npr.org/2024/06/07/nx-s1-4994426/wild-elephants-individual-names, +92,typeofhuman,1.0,40641443,[40643502],18,1718070765,New York Times Responds to Source Code 
Leak,story,https://www.securityweek.com/new-york-times-responds-to-source-code-leak/, +93,whereistimbo,2.0,40633871,[40638951],26,1718028602,EbookFoundation/Free-Programming-Books,story,https://github.com/EbookFoundation/free-programming-books/blob/main/books/free-programming-books-langs.md, +94,vyrotek,3.0,40638990,"[40642274, 40640767, 40641217]",13,1718053030,Pixel-Composer – Node based VFX compositor for pixel art,story,https://pixel-composer.com, +95,zhengiszen,131.0,40640499,"[40641412, 40640929, 40640975, 40640948, 40640994, 40640960, 40640982, 40642311, 40641288, 40641688, 40642247, 40640992, 40640896, 40641842, 40642204]",129,1718062009,Intel pauses work on $25B Israel fab,story,https://www.theregister.com/2024/06/10/intel_israeli_fab/, +96,aarondf,0.0,40636079,,15,1718041505,SQLSync – collaborative offline-first wrapper around SQLite,story,https://sqlsync.dev/, +97,bookofjoe,85.0,40625959,"[40625962, 40629914, 40634767, 40629578, 40628085, 40638122, 40635553, 40632086, 40628532, 40637225, 40632160, 40626116, 40629847]",100,1717953678,A new world of DIY medical testing,story,https://www.washingtonpost.com/technology/2024/06/09/home-health-tests-doctors-fda/, +98,fanf2,53.0,40626969,"[40627849, 40631545, 40628142, 40628172, 40631576, 40628624, 40627830, 40632258, 40633365, 40628350, 40631074, 40630851, 40628212, 40630249, 40629557, 40631094, 40628251]",312,1717962123,Libtree: Ldd as a tree saying why a library is found or not,story,https://github.com/haampie/libtree, +99,rbanffy,1.0,40639299,[40642392],12,1718054634,Django Enhancement Proposal 14: Background Workers,story,https://www.djangoproject.com/weblog/2024/may/29/django-enhancement-proposal-14-background-workers/, diff --git a/analytics/dagster/data/topstory_ids.json b/analytics/dagster/data/topstory_ids.json new file mode 100644 index 000000000..5c9ee3b91 --- /dev/null +++ b/analytics/dagster/data/topstory_ids.json @@ -0,0 +1 @@ +[40643207, 40643181, 40643167, 40639506, 40642871, 40642476, 40630952, 
40631796, 40639606, 40623864, 40639628, 40639450, 40644323, 40635397, 40636292, 40640927, 40632064, 40640833, 40633003, 40643071, 40638764, 40641848, 40636844, 40637102, 40632745, 40636883, 40641704, 40641795, 40633902, 40639032, 40640424, 40641615, 40641932, 40644454, 40631614, 40637374, 40632533, 40637785, 40631466, 40635697, 40631573, 40635789, 40633773, 40642801, 40644605, 40632397, 40615002, 40644111, 40630699, 40643499, 40622209, 40644459, 40627113, 40638386, 40640534, 40643744, 40636854, 40636122, 40637303, 40641388, 40631439, 40631585, 40643259, 40622671, 40619311, 40638445, 40640635, 40627563, 40637089, 40626807, 40623497, 40632773, 40641361, 40622999, 40641091, 40639742, 40631223, 40630656, 40634042, 40643998, 40638741, 40643454, 40634269, 40634774, 40642328, 40642272, 40631558, 40622191, 40634465, 40641116, 40634186, 40640076, 40641443, 40633871, 40638990, 40640499, 40636079, 40625959, 40626969, 40639299] \ No newline at end of file diff --git a/analytics/dagster/requirements.txt b/analytics/dagster/requirements.txt new file mode 100644 index 000000000..57e66a4a0 --- /dev/null +++ b/analytics/dagster/requirements.txt @@ -0,0 +1,3 @@ +matplotlib +pandas +requests \ No newline at end of file diff --git a/analytics/dagster/src/__init__.py b/analytics/dagster/src/__init__.py new file mode 100644 index 000000000..3c25b881e --- /dev/null +++ b/analytics/dagster/src/__init__.py @@ -0,0 +1 @@ +from .definitions import defs as defs diff --git a/analytics/dagster/src/assets/__init__.py b/analytics/dagster/src/assets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/analytics/dagster/src/assets/hackernews.py b/analytics/dagster/src/assets/hackernews.py new file mode 100644 index 000000000..11059a0c1 --- /dev/null +++ b/analytics/dagster/src/assets/hackernews.py @@ -0,0 +1,98 @@ +import base64 +import json +import os +from io import BytesIO + +import matplotlib.pyplot as plt +import pandas as pd +import requests +from dagster import 
from dagster import AssetExecutionContext, MaterializeResult, MetadataValue, asset

# Seconds to wait on a single HackerNews API call before giving up, so that a
# stalled connection cannot hang a scheduled run indefinitely.
_HN_REQUEST_TIMEOUT = 10


@asset(group_name="hackernews", compute_kind="HackerNews API")
def topstory_ids() -> None:
    """Get up to 100 top stories from the HackerNews topstories endpoint.

    The ids are written as a JSON list to ``data/topstory_ids.json`` (relative
    to the current working directory); the ``data`` directory is created when
    it does not exist yet.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.

    API Docs: https://github.com/HackerNews/API#new-top-and-best-stories
    """
    topstories_url = "https://hacker-news.firebaseio.com/v0/topstories.json"
    response = requests.get(topstories_url, timeout=_HN_REQUEST_TIMEOUT)
    # Fail loudly instead of slicing an error payload as if it were the id list.
    response.raise_for_status()
    top_story_ids = response.json()[:100]

    os.makedirs("data", exist_ok=True)
    with open("data/topstory_ids.json", "w") as f:
        json.dump(top_story_ids, f)


@asset(deps=[topstory_ids], group_name="hackernews", compute_kind="HackerNews API")
def topstories(context: AssetExecutionContext) -> MaterializeResult:
    """Get items based on story ids from the HackerNews items endpoint.

    Reads the ids from ``data/topstory_ids.json``, fetches every item and
    writes the result to ``data/topstories.csv``. It may take 30 seconds to
    fetch all 100 items.

    Args:
        context: Dagster execution context, used for progress logging.

    Returns:
        A ``MaterializeResult`` whose metadata holds the number of records and
        a Markdown preview of the first rows.

    Raises:
        requests.HTTPError: If an item request answers with an error status.
        requests.Timeout: If an item request does not answer in time.

    API Docs: https://github.com/HackerNews/API#items
    """
    with open("data/topstory_ids.json", "r") as f:
        story_ids = json.load(f)

    results = []
    # A single session lets all item requests reuse the same connection.
    with requests.Session() as session:
        for item_id in story_ids:
            response = session.get(
                f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json",
                timeout=_HN_REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            item = response.json()
            if item is None:
                # A JSON `null` body carries no fields; appending it would put
                # a non-dict row into the DataFrame built below.
                context.log.warning(f"Item {item_id} returned no data, skipping.")
                continue
            results.append(item)

            if len(results) % 20 == 0:
                context.log.info(f"Got {len(results)} items so far.")

    df = pd.DataFrame(results)
    df.to_csv("data/topstories.csv")

    return MaterializeResult(
        metadata={
            "num_records": len(df),  # Metadata can be any key-value pair
            "preview": MetadataValue.md(df.head().to_markdown()),
            # The `MetadataValue` class has useful static methods to build Metadata
        }
    )


@asset(deps=[topstories], group_name="hackernews", compute_kind="Plot")
def most_frequent_words(context: AssetExecutionContext) -> MaterializeResult:
    """Get the top 25 most frequent words in the titles of the top 100 HackerNews stories.

    Reads ``data/topstories.csv``, writes the word counts to
    ``data/most_frequent_words.json`` and renders them as a bar chart.

    Args:
        context: Dagster execution context (not used by the computation).

    Returns:
        A ``MaterializeResult`` whose ``plot`` metadata is a Markdown image
        with the bar chart embedded as a base64 PNG.
    """
    stopwords = {"a", "the", "an", "of", "to", "in", "for", "and", "with", "on", "is"}

    stories = pd.read_csv("data/topstories.csv")

    # Count how often each word occurs across all titles. Rows without a title
    # are read back as NaN (a float, which has no `.lower()`), so drop them.
    word_counts = {}
    for raw_title in stories["title"].dropna().astype(str):
        for word in raw_title.lower().split():
            cleaned_word = word.strip(".,-!?:;()[]'\"-")
            if cleaned_word and cleaned_word not in stopwords:
                word_counts[cleaned_word] = word_counts.get(cleaned_word, 0) + 1

    # Keep the 25 most frequent words; the sort is stable, so ties stay in
    # first-seen order.
    top_words = dict(
        sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)[:25]
    )

    # Make a bar chart of the top 25 words and render it to an in-memory PNG.
    buffer = BytesIO()
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.bar(list(top_words.keys()), list(top_words.values()))
        plt.xticks(rotation=45, ha="right")
        plt.title("Top 25 Words in Hacker News Titles")
        plt.tight_layout()
        plt.savefig(buffer, format="png")
    finally:
        # pyplot keeps every figure alive until it is closed; without this each
        # run would leave one more figure behind in the long-lived process.
        plt.close(fig)
    image_data = base64.b64encode(buffer.getvalue())

    # Convert the image to Markdown to preview it within Dagster
    md_content = f"![img](data:image/png;base64,{image_data.decode()})"

    with open("data/most_frequent_words.json", "w") as f:
        json.dump(top_words, f)

    # Attach the Markdown content as metadata to the asset
    return MaterializeResult(metadata={"plot": MetadataValue.md(md_content)})
from dagster import (
    Definitions,
    ScheduleDefinition,
    define_asset_job,
    load_assets_from_package_module,
)

from . import assets

# Job built with `define_asset_job` and no asset selection argument.
_all_assets_job = define_asset_job(name="all_assets_job")

# Cron expression "0 0 * * *": minute 0 of hour 0, every day.
daily_refresh_schedule = ScheduleDefinition(
    cron_schedule="0 0 * * *",
    job=_all_assets_job,
)

# Entry point re-exported by the package `__init__`: the assets loaded from
# the `assets` package plus the daily schedule.
defs = Definitions(
    schedules=[daily_refresh_schedule],
    assets=load_assets_from_package_module(assets),
)