Skip to content

Commit

Permalink
Add end-to-end integration test for entire pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
brawer committed May 14, 2024
1 parent a1d9361 commit 87f5781
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 29 deletions.
32 changes: 32 additions & 0 deletions cmd/qrank-builder/build.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package main

import (
"context"
)

// Build executes the QRank pipeline stages one after another: it builds
// pageviews, reads the wiki sites found under dumps, builds page signals,
// and finally builds item signals. The first stage that fails aborts the
// run and its error is returned unchanged.
func Build(dumps string, numWeeks int, s3 S3) error {
	ctx := context.Background()

	pv, err := buildPageviews(ctx, dumps, numWeeks, s3)
	if err != nil {
		return err
	}

	wikiSites, err := ReadWikiSites(dumps)
	if err != nil {
		return err
	}
	logger.Printf("found wikimedia dumps for %d sites", len(*wikiSites))

	err = buildPageSignals(ctx, dumps, wikiSites, s3)
	if err != nil {
		return err
	}

	// Only the error of the last stage is of interest here; its other
	// result is intentionally discarded.
	_, err = buildItemSignals(ctx, pv, wikiSites, s3)
	return err
}
41 changes: 41 additions & 0 deletions cmd/qrank-builder/build_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package main

import (
"bytes"
"log"
"path/filepath"
"slices"
"testing"
)

// TestBuild is a large integration test that runs the entire pipeline
// on the dumps under testdata/dumps, using the storage returned by
// NewFakeS3, and compares the resulting item signals file line by line
// against the expected content. It is skipped when -short is set.
func TestBuild(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}

	// Route pipeline logging into a throwaway buffer for this test only.
	// The package-level logger is restored afterwards so that tests running
	// later in the same package are not silently affected.
	savedLogger := logger
	t.Cleanup(func() { logger = savedLogger })
	logger = log.New(&bytes.Buffer{}, "", log.Lshortfile)

	const numWeeks = 1
	dumps := filepath.Join("testdata", "dumps")
	s3 := NewFakeS3()
	if err := Build(dumps, numWeeks, s3); err != nil {
		t.Fatal(err)
	}

	const outPath = "public/item_signals-20240501.csv.zst"
	got, err := s3.ReadLines(outPath)
	if err != nil {
		t.Fatal(err)
	}

	want := []string{
		"item,pageviews_52w,wikitext_bytes,claims,identifiers,sitelinks",
		"Q72,0,3142,550,85,186",
		"Q5296,0,2872,0,0,0",
		"Q662541,3,4973,32,9,15",
		"Q5649951,0,0,1,0,20",
		"Q107661323,0,3470,0,0,0",
	}

	if !slices.Equal(got, want) {
		// %q makes empty lines and stray whitespace visible in the failure.
		t.Errorf("%s:\ngot  %q\nwant %q", outPath, got, want)
	}
}
32 changes: 3 additions & 29 deletions cmd/qrank-builder/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,37 +99,11 @@ func NewStorageClient(keypath string) (*minio.Client, error) {
}

func computeQRank(dumpsPath string, testRun bool, storage *minio.Client) error {
ctx := context.Background()
var s3 S3 = storage

// Build pageviews files. We're changing how pageviews get aggregated,
// and this is the new, vastly more efficient approach. But it is not
// fully implemented yet, so we do not yet actually use the output.
// The older approach is done by the call to processPageviews below.
// https://github.com/brawer/wikidata-qrank/issues/23
numWeeks := 52
pageviews, err := buildPageviews(ctx, dumpsPath, numWeeks, s3)
if err != nil {
return err
}

sites, err := ReadWikiSites(dumpsPath)
if err != nil {
return err
}
logger.Printf("found wikimedia dumps for %d sites", len(*sites))
if err := buildPageSignals(ctx, dumpsPath, sites, s3); err != nil {
return err
}

_, err = buildItemSignals(ctx, pageviews, sites, s3)
if err != nil {
return err
}
return Build(dumpsPath /*numWeeks*/, 52, storage)

// TODO: Old code starts here, remove after new implementation is done.
return nil

ctx := context.Background()
outDir := "cache"
if testRun {
outDir = "cache-testrun"
Expand All @@ -148,7 +122,7 @@ func computeQRank(dumpsPath string, testRun bool, storage *minio.Client) error {
return err
}

pageviews, err = processPageviews(testRun, dumpsPath, edate, outDir, ctx)
pageviews, err := processPageviews(testRun, dumpsPath, edate, outDir, ctx)
if err != nil {
return err
}
Expand Down

0 comments on commit 87f5781

Please sign in to comment.