diff --git a/cmd/qrank-builder/build.go b/cmd/qrank-builder/build.go
new file mode 100644
index 0000000..4fb2497
--- /dev/null
+++ b/cmd/qrank-builder/build.go
@@ -0,0 +1,32 @@
+package main
+
+import (
+	"context"
+)
+
+// Build runs the entire QRank pipeline.
+func Build(dumps string, numWeeks int, s3 S3) error {
+	ctx := context.Background()
+
+	pageviews, err := buildPageviews(ctx, dumps, numWeeks, s3)
+	if err != nil {
+		return err
+	}
+
+	sites, err := ReadWikiSites(dumps)
+	if err != nil {
+		return err
+	}
+	logger.Printf("found wikimedia dumps for %d sites", len(*sites))
+
+	if err := buildPageSignals(ctx, dumps, sites, s3); err != nil {
+		return err
+	}
+
+	_, err = buildItemSignals(ctx, pageviews, sites, s3)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
diff --git a/cmd/qrank-builder/build_test.go b/cmd/qrank-builder/build_test.go
new file mode 100644
index 0000000..e57343d
--- /dev/null
+++ b/cmd/qrank-builder/build_test.go
@@ -0,0 +1,41 @@
+package main
+
+import (
+	"bytes"
+	"log"
+	"path/filepath"
+	"slices"
+	"testing"
+)
+
+// TestBuild is a large integration test that runs the entire pipeline.
+func TestBuild(t *testing.T) {
+	if testing.Short() {
+		t.Skip()
+	}
+
+	logger = log.New(&bytes.Buffer{}, "", log.Lshortfile)
+	dumps := filepath.Join("testdata", "dumps")
+	s3 := NewFakeS3()
+	if err := Build(dumps /*numWeeks*/, 1, s3); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := s3.ReadLines("public/item_signals-20240501.csv.zst")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	want := []string{
+		"item,pageviews_52w,wikitext_bytes,claims,identifiers,sitelinks",
+		"Q72,0,3142,550,85,186",
+		"Q5296,0,2872,0,0,0",
+		"Q662541,3,4973,32,9,15",
+		"Q5649951,0,0,1,0,20",
+		"Q107661323,0,3470,0,0,0",
+	}
+
+	if !slices.Equal(got, want) {
+		t.Errorf("got %v, want %v", got, want)
+	}
+}
diff --git a/cmd/qrank-builder/main.go b/cmd/qrank-builder/main.go
index 058395f..fb908b6 100644
--- a/cmd/qrank-builder/main.go
+++ b/cmd/qrank-builder/main.go
@@ -99,37 +99,11 @@ func NewStorageClient(keypath string) (*minio.Client, error) {
 }
 
 func computeQRank(dumpsPath string, testRun bool, storage *minio.Client) error {
-	ctx := context.Background()
-	var s3 S3 = storage
-
-	// Build pageviews files. We're changing how pageviews get aggregated,
-	// and this is the new, vastly more efficient approach. But it is not
-	// fully implemented yet, so we do not yet actually use the output.
-	// The older approach is done by the call to processPageviews below.
-	// https://github.com/brawer/wikidata-qrank/issues/23
-	numWeeks := 52
-	pageviews, err := buildPageviews(ctx, dumpsPath, numWeeks, s3)
-	if err != nil {
-		return err
-	}
-
-	sites, err := ReadWikiSites(dumpsPath)
-	if err != nil {
-		return err
-	}
-	logger.Printf("found wikimedia dumps for %d sites", len(*sites))
-	if err := buildPageSignals(ctx, dumpsPath, sites, s3); err != nil {
-		return err
-	}
-
-	_, err = buildItemSignals(ctx, pageviews, sites, s3)
-	if err != nil {
-		return err
-	}
+	return Build(dumpsPath /*numWeeks*/, 52, storage)
 
 	// TODO: Old code starts here, remove after new implementation is done.
-	return nil
+	ctx := context.Background()
 
 	outDir := "cache"
 	if testRun {
 		outDir = "cache-testrun"
@@ -148,7 +122,7 @@ func computeQRank(dumpsPath string, testRun bool, storage *minio.Client) error {
 		return err
 	}
 
-	pageviews, err = processPageviews(testRun, dumpsPath, edate, outDir, ctx)
+	pageviews, err := processPageviews(testRun, dumpsPath, edate, outDir, ctx)
 	if err != nil {
 		return err
 	}
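
Note: Build and TestBuild rely on an S3 interface and a FakeS3 test double that are defined elsewhere in the qrank-builder package and do not appear in this diff. The following is a minimal, illustrative sketch of the contract they seem to assume; the names, fields, and signatures here are hypothetical, not the repository's actual definitions.

// Illustrative sketch only; the real S3 interface and FakeS3 live
// elsewhere in the package and may differ from this.
package main

import "fmt"

// S3 abstracts the object storage that Build writes its output to,
// so tests can substitute an in-memory fake for the real MinIO client.
type S3 interface {
	ReadLines(path string) ([]string, error)
}

// FakeS3 keeps objects in memory, keyed by path. The real ReadLines
// presumably also decompresses .zst objects before splitting them
// into lines; that step is omitted here.
type FakeS3 struct {
	objects map[string][]string
}

func NewFakeS3() *FakeS3 {
	return &FakeS3{objects: make(map[string][]string)}
}

func (s *FakeS3) ReadLines(path string) ([]string, error) {
	lines, ok := s.objects[path]
	if !ok {
		return nil, fmt.Errorf("no such object: %s", path)
	}
	return lines, nil
}

Keeping Build behind a narrow interface like this is what lets TestBuild swap in an in-memory fake and assert on the exact CSV lines written to public/item_signals-20240501.csv.zst, without touching real storage.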