Skip to content

Commit

Permalink
Generalize helper method for listing files in S3 storage
Browse files Browse the repository at this point in the history
We’ll eventually store files other than just `page_signals`.
#10
  • Loading branch information
brawer committed May 24, 2024
1 parent 29475cf commit e56c840
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 49 deletions.
25 changes: 1 addition & 24 deletions cmd/qrank-builder/pagesignals.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import (
// If a page_signals file is already stored for the last dumped version
// of a site, it is not rebuilt.
func buildPageSignals(ctx context.Context, dumps string, sites *map[string]WikiSite, s3 S3) error {
stored, err := storedPageSignals(ctx, s3)
stored, err := ListStoredFiles(ctx, "page_signals", s3)
if err != nil {
return err
}
Expand Down Expand Up @@ -293,29 +293,6 @@ func processPageTable(ctx context.Context, dumps string, site *WikiSite, out cha
}
}

// storedPageSignals returns what page_signals files are available in storage.
//
// It lists the objects in bucket "qrank" under the prefix "page_signals/"
// and keeps those whose key has the form
// "page_signals/<name>-<8 digits>-page_signals.zst". The result maps each
// <name> to the eight-digit stamps found for it, sorted in ascending
// string order. Keys that do not follow this scheme are ignored. The first
// listing error aborts the scan and is returned with a nil map.
func storedPageSignals(ctx context.Context, s3 S3) (map[string][]string, error) {
	// The dot before "zst" is escaped so that only a literal "." matches.
	re := regexp.MustCompile(`^page_signals/([a-z0-9_\-]+)-(\d{8})-page_signals\.zst$`)
	result := make(map[string][]string, 1000)
	opts := minio.ListObjectsOptions{Prefix: "page_signals/"}
	for obj := range s3.ListObjects(ctx, "qrank", opts) {
		if obj.Err != nil {
			return nil, obj.Err
		}
		match := re.FindStringSubmatch(obj.Key)
		if match == nil {
			continue
		}
		// Appending to a missing (nil) entry allocates the slice on demand.
		result[match[1]] = append(result[match[1]], match[2])
	}
	for _, stamps := range result {
		sort.Strings(stamps)
	}
	return result, nil
}

type pageSignalsScanner struct {
err error
paths []string
Expand Down
26 changes: 1 addition & 25 deletions cmd/qrank-builder/pagesignals_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"fmt"
"log"
"path/filepath"
"reflect"
"slices"
"strings"
"testing"
Expand Down Expand Up @@ -83,7 +82,7 @@ func TestBuildPageSignals(t *testing.T) {
}

// Verify that obsolete files have been cleaned up.
stored, err := storedPageSignals(context.Background(), s3)
stored, err := ListStoredFiles(context.Background(), "page_signals", s3)
if err != nil {
t.Error(err)
}
Expand All @@ -94,29 +93,6 @@ func TestBuildPageSignals(t *testing.T) {
}
}

// TestStoredPageSignals stores a few objects in a fake S3 and checks that
// storedPageSignals groups the well-formed page_signals keys by name,
// reports their eight-digit stamps in ascending order, and ignores a key
// that does not follow the naming scheme.
func TestStoredPageSignals(t *testing.T) {
	s3 := NewFakeS3()
	for _, path := range []string{
		"page_signals/alswikibooks-20010203-page_signals.zst",
		"page_signals/alswikibooks-20050607-page_signals.zst",
		"page_signals/rmwiki-20241122-page_signals.zst",
		"page_signals/junk.txt",
	} {
		s3.data[path] = []byte("content")
	}
	got, err := storedPageSignals(context.Background(), s3)
	if err != nil {
		// The result is meaningless after a failed call, so stop here
		// instead of also reporting a confusing comparison failure.
		t.Fatal(err)
	}
	want := map[string][]string{
		"alswikibooks": {"20010203", "20050607"},
		"rmwiki":       {"20241122"},
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("got %v, want %v", got, want)
	}
}

func TestPageSignalsScanner(t *testing.T) {
logger = log.New(&bytes.Buffer{}, "", log.Lshortfile)
s3 := NewFakeS3()
Expand Down
25 changes: 25 additions & 0 deletions cmd/qrank-builder/s3.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (
"fmt"
"io"
"os"
"regexp"
"sort"

"github.com/minio/minio-go/v7"
//"github.com/minio/minio-go/v7/pkg/credentials"
Expand Down Expand Up @@ -105,3 +107,26 @@ func PutInStorage(ctx context.Context, file string, s3 S3, bucket string, dest s
_, err := s3.FPutObject(ctx, bucket, dest, file, options)
return err
}

// ListStoredFiles returns what files are available in S3 storage.
//
// It lists the objects in bucket "qrank" under the prefix "<filename>/"
// and keeps those whose key has the form
// "<filename>/<name>-<8 digits>-<filename>.zst". The result maps each
// <name> to the eight-digit stamps found for it, sorted in ascending
// string order. Keys that do not follow this scheme are ignored. The first
// listing error aborts the scan and is returned with a nil map.
func ListStoredFiles(ctx context.Context, filename string, s3 S3) (map[string][]string, error) {
	// Quote filename so that any regexp metacharacters in it are matched
	// literally; the quoted pattern always compiles, so MustCompile cannot
	// panic on caller input. The dot before "zst" is escaped as well.
	quoted := regexp.QuoteMeta(filename)
	re := regexp.MustCompile(fmt.Sprintf(`^%s/([a-z0-9_\-]+)-(\d{8})-%s\.zst$`, quoted, quoted))
	result := make(map[string][]string, 1000)
	opts := minio.ListObjectsOptions{Prefix: filename + "/"}
	for obj := range s3.ListObjects(ctx, "qrank", opts) {
		if obj.Err != nil {
			return nil, obj.Err
		}
		match := re.FindStringSubmatch(obj.Key)
		if match == nil {
			continue
		}
		// Appending to a missing (nil) entry allocates the slice on demand.
		result[match[1]] = append(result[match[1]], match[2])
	}
	for _, stamps := range result {
		sort.Strings(stamps)
	}
	return result, nil
}
24 changes: 24 additions & 0 deletions cmd/qrank-builder/s3_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"fmt"
"io"
"os"
"reflect"
"slices"
"strings"
"sync"
Expand Down Expand Up @@ -229,3 +230,26 @@ func TestReadWriteLinest(t *testing.T) {
}
}
}

// TestListStoredFiles stores a few objects in a fake S3 and checks that
// ListStoredFiles, asked for "page_signals", groups the well-formed keys
// by name, reports their eight-digit stamps in ascending order, and
// ignores a key that does not follow the naming scheme.
func TestListStoredFiles(t *testing.T) {
	s3 := NewFakeS3()
	for _, path := range []string{
		"page_signals/alswikibooks-20010203-page_signals.zst",
		"page_signals/alswikibooks-20050607-page_signals.zst",
		"page_signals/rmwiki-20241122-page_signals.zst",
		"page_signals/junk.txt",
	} {
		s3.data[path] = []byte("content")
	}
	got, err := ListStoredFiles(context.Background(), "page_signals", s3)
	if err != nil {
		// The result is meaningless after a failed call, so stop here
		// instead of also reporting a confusing comparison failure.
		t.Fatal(err)
	}
	want := map[string][]string{
		"alswikibooks": {"20010203", "20050607"},
		"rmwiki":       {"20241122"},
	}
	if !reflect.DeepEqual(got, want) {
		t.Errorf("got %v, want %v", got, want)
	}
}

0 comments on commit e56c840

Please sign in to comment.