diff --git a/cmd/qrank-builder/itemsignals.go b/cmd/qrank-builder/itemsignals.go
index 1057888..3dcb200 100644
--- a/cmd/qrank-builder/itemsignals.go
+++ b/cmd/qrank-builder/itemsignals.go
@@ -6,15 +6,92 @@ package main
 import (
 	"bytes"
 	"context"
+	"encoding/binary"
 	"fmt"
 	"os"
 	"regexp"
 	"time"
 
 	"github.com/klauspost/compress/zstd"
+	"github.com/lanrat/extsort"
 	"github.com/minio/minio-go/v7"
 )
 
+// ItemSignals contains ranking signals for Wikidata items.
+type ItemSignals struct {
+	item          int64 // eg 72 for Q72
+	pageviews     int64
+	wikitextBytes int64
+	claims        int64
+	identifiers   int64
+	sitelinks     int64
+}
+
+// If we ever want to rank signals for Wikidata lexemes, it would
+// probably make sense to use a separate struct (written to a different
+// output file) because it's likely not the same set of signals.
+// For example, lexemes don't have pageviews, pagerank or wikitextBytes.
+// https://github.com/brawer/wikidata-qrank/issues/37
+// type LexemeSignals struct {}
+
+// ToBytes serializes the signals as six consecutive signed varints,
+// in struct field order. ItemSignalsFromBytes is the inverse.
+func (s ItemSignals) ToBytes() []byte {
+	buf := make([]byte, binary.MaxVarintLen64*6)
+	p := binary.PutVarint(buf, s.item)
+	p += binary.PutVarint(buf[p:], s.pageviews)
+	p += binary.PutVarint(buf[p:], s.wikitextBytes)
+	p += binary.PutVarint(buf[p:], s.claims)
+	p += binary.PutVarint(buf[p:], s.identifiers)
+	p += binary.PutVarint(buf[p:], s.sitelinks)
+	return buf[0:p]
+}
+
+// ItemSignalsFromBytes decodes a record written by ItemSignals.ToBytes.
+// The signature has no error result, so a truncated or overflowing
+// varint panics instead of silently decoding as zero.
+func ItemSignalsFromBytes(b []byte) extsort.SortType {
+	var v [6]int64
+	pos := 0
+	for i := range v {
+		val, n := binary.Varint(b[pos:])
+		if n <= 0 {
+			panic(fmt.Sprintf("ItemSignalsFromBytes: bad varint in field %d", i))
+		}
+		v[i] = val
+		pos += n
+	}
+	return ItemSignals{
+		item:          v[0],
+		pageviews:     v[1],
+		wikitextBytes: v[2],
+		claims:        v[3],
+		identifiers:   v[4],
+		sitelinks:     v[5],
+	}
+}
+
+// ItemSignalsLess reports whether a sorts before b. Records are ordered
+// by item, then pageviews, wikitextBytes, claims, identifiers, and
+// sitelinks. Both arguments must hold ItemSignals values.
+func ItemSignalsLess(a, b extsort.SortType) bool {
+	aa, bb := a.(ItemSignals), b.(ItemSignals)
+	switch {
+	case aa.item != bb.item:
+		return aa.item < bb.item
+	case aa.pageviews != bb.pageviews:
+		return aa.pageviews < bb.pageviews
+	case aa.wikitextBytes != bb.wikitextBytes:
+		return aa.wikitextBytes < bb.wikitextBytes
+	case aa.claims != bb.claims:
+		return aa.claims < bb.claims
+	case aa.identifiers != bb.identifiers:
+		return aa.identifiers < bb.identifiers
+	default:
+		return aa.sitelinks < bb.sitelinks
+	}
+}
+
 // BuildItemSignals builds per-item signals and puts them in storage.
 // If the signals file is already in storage, it does not get re-built.
 func buildItemSignals(ctx context.Context, pageviews []string, sites *map[string]WikiSite, s3 S3) (time.Time, error) {
diff --git a/cmd/qrank-builder/itemsignals_test.go b/cmd/qrank-builder/itemsignals_test.go
index 31fec91..32a0737 100644
--- a/cmd/qrank-builder/itemsignals_test.go
+++ b/cmd/qrank-builder/itemsignals_test.go
@@ -7,11 +7,68 @@ import (
 	"bytes"
 	"context"
 	"log"
+	"reflect"
 	"slices"
 	"testing"
 	"time"
 )
 
+func TestItemSignalsToBytes(t *testing.T) {
+	// Serialize and then de-serialize an ItemSignals struct.
+	a := ItemSignals{1, 2, 3, 4, 5, 6}
+	got := ItemSignalsFromBytes(a.ToBytes()).(ItemSignals)
+	if !reflect.DeepEqual(got, a) {
+		t.Errorf("got %v, want %v", got, a)
+	}
+}
+
+func TestItemSignalsLess(t *testing.T) {
+	for _, tc := range []struct {
+		a    string
+		b    string
+		want bool
+	}{
+		{"123456", "123456", false},
+		{"923456", "123456", false},
+		{"123456", "923456", true},
+
+		{"------", "------", false},
+		{"7-----", "------", false},
+		{"-7----", "------", false},
+		{"--7---", "------", false},
+		{"---7--", "------", false},
+		{"----7-", "------", false},
+		{"-----7", "------", false},
+		{"------", "7-----", true},
+		{"------", "-7----", true},
+		{"------", "--7---", true},
+		{"------", "---7--", true},
+		{"------", "----7-", true},
+		{"------", "-----7", true},
+	} {
+		a := ItemSignals{
+			item:          int64(tc.a[0]),
+			pageviews:     int64(tc.a[1]),
+			wikitextBytes: int64(tc.a[2]),
+			claims:        int64(tc.a[3]),
+			identifiers:   int64(tc.a[4]),
+			sitelinks:     int64(tc.a[5]),
+		}
+		b := ItemSignals{
+			item:          int64(tc.b[0]),
+			pageviews:     int64(tc.b[1]),
+			wikitextBytes: int64(tc.b[2]),
+			claims:        int64(tc.b[3]),
+			identifiers:   int64(tc.b[4]),
+			sitelinks:     int64(tc.b[5]),
+		}
+		got := ItemSignalsLess(a, b)
+		if got != tc.want {
+			t.Errorf("got %v, want %v, for ItemSignalsLess(%#v, %#v)", got, tc.want, a, b)
+		}
+	}
+}
+
 func TestBuildItemSignals(t *testing.T) {
 	logger = log.New(&bytes.Buffer{}, "", log.Lshortfile)
 	ctx := context.Background()