From b347c62090efd9da0dc6a033c6c6a8f532b943f2 Mon Sep 17 00:00:00 2001 From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com> Date: Sun, 10 Nov 2024 11:54:37 +0100 Subject: [PATCH 1/6] Sort wacz index --- internal/services/archiver.go | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/internal/services/archiver.go b/internal/services/archiver.go index c689a10..62756e3 100644 --- a/internal/services/archiver.go +++ b/internal/services/archiver.go @@ -4,7 +4,6 @@ import ( "archive/zip" "bytes" "compress/gzip" - "crypto/md5" "crypto/sha256" "encoding/hex" "encoding/json" @@ -14,6 +13,7 @@ import ( "net/http" "net/url" "os" + "slices" "strings" "time" @@ -123,7 +123,7 @@ func (s *Archiver) AddRecord(response *http.Response) { Length: fmt.Sprintf("%d", len(contentBuffer.Bytes())), Mime: response.Header.Get("Content-Type"), Filename: filePath, - Digest: fmt.Sprintf("%x", md5.Sum(contentBuffer.Bytes())), + Digest: fmt.Sprintf("sha-256:%x", sha256.Sum256(contentBuffer.Bytes())), Offset: fmt.Sprintf("%d", 0), } @@ -164,6 +164,7 @@ func (s *Archiver) Close() { } indexWriter := gzip.NewWriter(indexFile) + cdx := []string{} for _, entry := range s.cdxjEntries { parsedURL, err := url.Parse(entry.TargetURI) if err != nil { @@ -171,22 +172,24 @@ func (s *Archiver) Close() { continue } domainParts := strings.Split(parsedURL.Hostname(), ".") - for i := len(domainParts) - 1; i >= 0; i-- { - indexWriter.Write([]byte(domainParts[i])) - if i > 0 { - indexWriter.Write([]byte(",")) - } - } - indexWriter.Write([]byte(")/")) + slices.Reverse(domainParts) + searchableURL := strings.Join(domainParts, ",") + searchableURL = searchableURL + ")" + parsedURL.RequestURI() cdxjLine := fmt.Sprintf( - "%s %s\n", + "%s %s %s\n", + searchableURL, entry.Timestamp, fmt.Sprintf(`{"offset":"%s","status":"%s","length":"%s","mime":"%s","filename":"%s","url":"%s","digest":"%s"}`, entry.Offset, entry.Status, entry.Length, entry.Mime, entry.Filename, entry.TargetURI, entry.Digest, ), ) - indexWriter.Write([]byte(cdxjLine)) + cdx = append(cdx, cdxjLine) + + } + slices.Sort(cdx) + for _, e := range cdx { + indexWriter.Write([]byte(e)) } indexWriter.Close() From 931bf6f22d9fa9fd53366998677a1d644e78606e Mon Sep 17 00:00:00 2001 From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com> Date: Mon, 11 Nov 2024 12:52:00 +0100 Subject: [PATCH 2/6] Refactor archive and add datapackage digest --- internal/services/archiver.go | 377 ++++++++++++++++++++-------------- internal/services/crawler.go | 2 +- internal/services/project.go | 7 +- 3 files changed, 229 insertions(+), 157 deletions(-) diff --git a/internal/services/archiver.go b/internal/services/archiver.go index 62756e3..a9d9809 100644 --- a/internal/services/archiver.go +++ b/internal/services/archiver.go @@ -3,7 +3,6 @@ package services import ( "archive/zip" "bytes" - "compress/gzip" "crypto/sha256" "encoding/hex" "encoding/json" @@ -13,250 +12,260 @@ import ( "net/http" "net/url" "os" + "path/filepath" "slices" + "strconv" "strings" "time" "github.com/google/uuid" "github.com/slyrz/warc" + "github.com/stjudewashere/seonaut/internal/models" ) const ArchiveDir = "archive/" type Archiver struct { - zipWriter *zip.Writer - file *os.File - cdxjEntries []CDXJEntry - pagesEntries []PageEntry + file *os.File + cdxjEntries []CDXJEntry + waczWriter *zip.Writer + warcWriter *warc.Writer + warcOffset int } type CDXJEntry struct { - TargetURI string - Timestamp string - RecordID string - Offset string - Status string - Length string 
- Mime string - Filename string - Digest string + Offset string `json:"offset"` + Status string `json:"status"` + Length string `json:"length"` + Mime string `json:"mime"` + Filename string `json:"filename"` + Digest string `json:"digest"` + RecordDigest string `json:"recordDigest"` + time time.Time `json:"-"` + url url.URL `json:"-"` } type PageEntry struct { - URL string - TS string + URL string `json:"url"` + TS string `json:"ts"` } // Returns a new Archiver. // It creates a new wacz file for the given url string. -func NewArchiver(urlStr string) (*Archiver, error) { - file, err := os.Create(ArchiveDir + urlStr + ".wacz") +func NewArchiver(p models.Project) (*Archiver, error) { + // Create the project's archive directory if it doesn't exist. + projectPath := ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + err := os.MkdirAll(projectPath, 0755) if err != nil { return nil, err } + // Create the wacz file. + file, err := os.Create(projectPath + p.Host + ".wacz") + if err != nil { + return nil, err + } + waczWriter := zip.NewWriter(file) + + // Create the warc writer. + archiveFile, err := waczWriter.Create("data/data.warc") + if err != nil { + log.Printf("failed to create WARC file entry in ZIP: %v", err) + return nil, err + } + warcWriter := warc.NewWriter(archiveFile) + return &Archiver{ - zipWriter: zip.NewWriter(file), - file: file, + waczWriter: waczWriter, + file: file, + warcWriter: warcWriter, }, nil } // AddRecord adds a new response record to the warc file and keeps track // of the added records to create the index once the archiver is closed. func (s *Archiver) AddRecord(response *http.Response) { - uuidStr := uuid.New().String() - record := warc.NewRecord() - record.Header.Set("warc-type", "response") - record.Header.Set("warc-date", time.Now().Format(time.RFC3339)) - record.Header.Set("warc-target-uri", response.Request.URL.String()) - record.Header.Set("content-type", response.Header.Get("Content-Type")) - record.Header.Set("warc-record-id", fmt.Sprintf("", uuidStr)) - - var contentBuffer bytes.Buffer - contentBuffer.WriteString(fmt.Sprintf("HTTP/%d.%d %d %s\r\n", - response.ProtoMajor, response.ProtoMinor, response.StatusCode, response.Status)) - - for key, values := range response.Header { - for _, value := range values { - contentBuffer.WriteString(fmt.Sprintf("%s: %s\r\n", key, value)) - } - } - contentBuffer.WriteString("\r\n") - var bodyCopy bytes.Buffer - _, err := io.Copy(&bodyCopy, response.Body) + err := s.readResponseBody(&bodyCopy, response) if err != nil { - log.Printf("Failed to copy response body: %v", err) + log.Printf("failed to read response body %v", err) return } - response.Body.Close() // Close the original body - response.Body = io.NopCloser(bytes.NewReader(bodyCopy.Bytes())) + var contentBuffer bytes.Buffer + s.readResponseHeaders(&contentBuffer, response) if _, err := io.Copy(&contentBuffer, &bodyCopy); err != nil { - fmt.Println("Error reading response body:", err) + fmt.Println("error reading response body copy:", err) return } + wdate := time.Now() + record := warc.NewRecord() + record.Header.Set("warc-type", "response") + record.Header.Set("warc-date", wdate.Format(time.RFC3339)) + record.Header.Set("warc-target-uri", response.Request.URL.String()) + record.Header.Set("content-type", response.Header.Get("Content-Type")) + record.Header.Set("warc-record-id", fmt.Sprintf("", uuid.New().String())) record.Content = bytes.NewReader(contentBuffer.Bytes()) - - filePath := ArchiveDir + fmt.Sprintf("data-%s.warc.gz", uuidStr) - archiveFile, err := 
s.zipWriter.Create(filePath) - if err != nil { - log.Printf("Failed to create WARC file entry in ZIP: %v", err) + if _, err := s.warcWriter.WriteRecord(record); err != nil { + log.Printf("failed to write WARC record to archive: %v", err) return } - archiveZipWritter := gzip.NewWriter(archiveFile) - archiveWriter := warc.NewWriter(archiveZipWritter) - if _, err := archiveWriter.WriteRecord(record); err != nil { - log.Printf("Failed to write WARC record to archive: %v", err) - return + cdxjEntry := CDXJEntry{ + Status: fmt.Sprintf("%d", response.StatusCode), + Length: fmt.Sprintf("%d", len(contentBuffer.Bytes())), + Mime: response.Header.Get("Content-Type"), + Filename: "data/data.warc.gz", + Digest: fmt.Sprintf("sha-256:%x", sha256.Sum256(bodyCopy.Bytes())), + RecordDigest: fmt.Sprintf("sha256:%x", sha256.Sum256(contentBuffer.Bytes())), + Offset: fmt.Sprintf("%d", s.warcOffset), + time: wdate, + url: *response.Request.URL, } + s.cdxjEntries = append(s.cdxjEntries, cdxjEntry) - archiveZipWritter.Close() + s.warcOffset += contentBuffer.Len() +} - cdxjEntry := CDXJEntry{ - TargetURI: response.Request.URL.String(), - Timestamp: time.Now().Format("20060102150405"), - RecordID: fmt.Sprintf("", uuidStr), - Status: fmt.Sprintf("%d", response.StatusCode), - Length: fmt.Sprintf("%d", len(contentBuffer.Bytes())), - Mime: response.Header.Get("Content-Type"), - Filename: filePath, - Digest: fmt.Sprintf("sha-256:%x", sha256.Sum256(contentBuffer.Bytes())), - Offset: fmt.Sprintf("%d", 0), +// readResponseBody Reads the http response's body into a bytes.Buffer. Then +// it resets the original response body so it can be used again later on. +func (s *Archiver) readResponseBody(bodyCopy *bytes.Buffer, response *http.Response) error { + _, err := io.Copy(bodyCopy, response.Body) + if err != nil { + return err } + response.Body.Close() // Close the original body + response.Body = io.NopCloser(bytes.NewReader(bodyCopy.Bytes())) - s.cdxjEntries = append(s.cdxjEntries, cdxjEntry) + return nil +} - pageEntry := PageEntry{ - URL: response.Request.URL.String(), - TS: time.Now().Format(time.RFC3339), - } +// readResponseHeaders reads the response's headers into a bytes.Buffer. +func (s *Archiver) readResponseHeaders(contentBuffer *bytes.Buffer, response *http.Response) { + contentBuffer.WriteString( + fmt.Sprintf( + "HTTP/%d.%d %d %s\r\n", + response.ProtoMajor, + response.ProtoMinor, + response.StatusCode, + response.Status, + ), + ) - s.pagesEntries = append(s.pagesEntries, pageEntry) + for key, values := range response.Header { + for _, value := range values { + contentBuffer.WriteString(fmt.Sprintf("%s: %s\r\n", key, value)) + } + } + contentBuffer.WriteString("\r\n") } // Close closes the archive and creates the remaining files. 
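// The body-reset move in readResponseBody above, shown in isolation: drain
// the original body into a buffer, then swap in a fresh reader over the same
// bytes so later consumers of the *http.Response can still read it. A
// minimal, hypothetical sketch, not taken from this patch:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"strings"
)

func resetBody(resp *http.Response) ([]byte, error) {
	var buf bytes.Buffer
	if _, err := io.Copy(&buf, resp.Body); err != nil {
		return nil, err
	}
	resp.Body.Close()
	// Replace the consumed body so it can be read again downstream.
	resp.Body = io.NopCloser(bytes.NewReader(buf.Bytes()))
	return buf.Bytes(), nil
}

func main() {
	resp := &http.Response{Body: io.NopCloser(strings.NewReader("hello"))}
	if _, err := resetBody(resp); err != nil {
		panic(err)
	}
	again, _ := io.ReadAll(resp.Body)
	fmt.Println(string(again)) // "hello" — the body survives the first read
}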
func (s *Archiver) Close() { - pagesFile, err := s.zipWriter.Create("pages/pages.jsonl") + err := s.createIndex() if err != nil { - log.Printf("Failed to create pages file entry in ZIP: %v", err) - return + log.Printf("failed to create index file entry in ZIP: %v", err) } - pagesWriter := gzip.NewWriter(pagesFile) - header := `{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}` - header += "\n" - pagesWriter.Write([]byte(header)) - - for _, page := range s.pagesEntries { - pageLine := fmt.Sprintf(`{"url":"%s","ts":"%s"}`, page.URL, page.TS) - pageLine += "\n" - pagesWriter.Write([]byte(pageLine)) + err = s.createPages() + if err != nil { + log.Printf("failed to create pages file entry in ZIP: %v", err) } - pagesWriter.Close() - indexFile, err := s.zipWriter.Create("indexes/index.cdx.gz") + s.waczWriter.Close() + s.file.Close() + + err = s.createDatapackage() if err != nil { - log.Printf("Failed to create WARC file entry in ZIP: %v", err) + log.Printf("failed to create datapackage.json: %v", err) return } - indexWriter := gzip.NewWriter(indexFile) +} + +// Create the index file. +func (s *Archiver) createIndex() error { + indexWriter, err := s.waczWriter.Create("indexes/index.cdx") + if err != nil { + return err + } cdx := []string{} for _, entry := range s.cdxjEntries { - parsedURL, err := url.Parse(entry.TargetURI) + domainParts := strings.Split(entry.url.Hostname(), ".") + slices.Reverse(domainParts) + searchableURL := strings.Join(domainParts, ",") + searchableURL = searchableURL + ")" + entry.url.RequestURI() + + jsonEntry, err := json.Marshal(entry) if err != nil { - log.Printf("Failed to parse URL: %v", err) + log.Printf("failed to json marshal index %v", err) continue } - domainParts := strings.Split(parsedURL.Hostname(), ".") - slices.Reverse(domainParts) - searchableURL := strings.Join(domainParts, ",") - searchableURL = searchableURL + ")" + parsedURL.RequestURI() - - cdxjLine := fmt.Sprintf( - "%s %s %s\n", - searchableURL, - entry.Timestamp, - fmt.Sprintf(`{"offset":"%s","status":"%s","length":"%s","mime":"%s","filename":"%s","url":"%s","digest":"%s"}`, - entry.Offset, entry.Status, entry.Length, entry.Mime, entry.Filename, entry.TargetURI, entry.Digest, - ), - ) - cdx = append(cdx, cdxjLine) + cdxjLine := fmt.Sprintf("%s %s %s\n", searchableURL, entry.time.Format("20060102150405"), jsonEntry) + cdx = append(cdx, cdxjLine) } + slices.Sort(cdx) for _, e := range cdx { indexWriter.Write([]byte(e)) } - indexWriter.Close() - - s.zipWriter.Close() - s.file.Close() - err = s.createDatapackageJSON() - if err != nil { - log.Printf("Failed to create datapackage.json: %v", err) - return - } + return nil } -// Create the datapackage.json file -func (s *Archiver) createDatapackageJSON() error { - archive, err := zip.OpenReader(s.file.Name()) +// addPage adds a new page record in the pages.jsonl file. 
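// The searchable-URL (SURT-style) transform used by createIndex above, in a
// standalone sketch: hostname labels are reversed and comma-joined, then the
// request URI is appended after ")". Because slices.Sort later orders the
// CDXJ lines lexicographically, entries group naturally by domain. The URL
// here is hypothetical.

package main

import (
	"fmt"
	"net/url"
	"slices"
	"strings"
)

func main() {
	u, _ := url.Parse("https://www.example.com/blog/post?page=2")
	parts := strings.Split(u.Hostname(), ".") // ["www" "example" "com"]
	slices.Reverse(parts)                     // ["com" "example" "www"]
	fmt.Println(strings.Join(parts, ",") + ")" + u.RequestURI())
	// Output: com,example,www)/blog/post?page=2
}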
+func (s *Archiver) createPages() error { + // Add the pages.jsonl and add the file header + pagesWriter, err := s.waczWriter.Create("pages/pages.jsonl") if err != nil { - log.Printf("Failed to open ZIP archive for reading: %v", err) return err } - defer archive.Close() - calculateSHA256AndSize := func(file *zip.File) (string, int64, error) { - rc, err := file.Open() - if err != nil { - log.Printf("Failed to open file in ZIP: %v", err) - return "", 0, err + header := `{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}` + header += "\n" + pagesWriter.Write([]byte(header)) + + for _, e := range s.cdxjEntries { + page := PageEntry{ + URL: e.url.String(), + TS: e.time.Format(time.RFC3339), } - defer rc.Close() - hash := sha256.New() - _, err = io.Copy(hash, rc) + jsonPage, err := json.Marshal(page) if err != nil { - log.Printf("Failed to calculate SHA256: %v", err) - return "", 0, err + return err } - fileSize := file.FileInfo().Size() - - return "sha256:" + hex.EncodeToString(hash.Sum(nil)), fileSize, nil - } - - var resources []map[string]interface{} - - for _, file := range archive.File { - hash, size, err := calculateSHA256AndSize(file) + jsonPage = append(jsonPage, '\n') + _, err = pagesWriter.Write(jsonPage) if err != nil { - return err + log.Printf("error adding page %s %v", jsonPage, err) } + } - resources = append(resources, map[string]interface{}{ - "name": file.Name, - "path": file.Name, - "hash": hash, - "bytes": size, - }) + return nil +} + +// Create the datapackage.json file. +// Opens the zip file and reads all the files to create the resources json along with the hash. +// Then it saves the datapackage and creates the datapackage-digest json file. +func (s *Archiver) createDatapackage() error { + archive, err := zip.OpenReader(s.file.Name()) + if err != nil { + log.Printf("Failed to open ZIP archive for reading: %v", err) + return err } + defer archive.Close() - datapackage := map[string]interface{}{ - "profile": "data-package", - "wacz_version": "1.1.1", - "resources": resources, + datapackage, err := s.getResources(archive) + if err != nil { + log.Printf("failed to get zip resources %v", err) } - f, err := os.OpenFile(s.file.Name(), os.O_RDWR|os.O_CREATE, 0644) + f, err := os.OpenFile(s.file.Name(), os.O_RDWR, 0644) if err != nil { - log.Println(err) return err } defer f.Close() @@ -264,22 +273,84 @@ func (s *Archiver) createDatapackageJSON() error { zipWriter := zip.NewWriter(f) defer zipWriter.Close() + // Copy existing files. for _, file := range archive.File { zipWriter.Copy(file) } + // create datapackage. datapackageFile, err := zipWriter.Create("datapackage.json") if err != nil { - log.Printf("Failed to create datapackage.json in ZIP: %v", err) + return err + } + _, err = datapackageFile.Write(datapackage) + if err != nil { return err } - encoder := json.NewEncoder(datapackageFile) - err = encoder.Encode(datapackage) + // create datapackage digest. + datapackageDigest, err := zipWriter.Create("datapackage-digest.json") + if err != nil { + return err + } + + hash := sha256.Sum256(datapackage) + hashHex := hex.EncodeToString(hash[:]) + digestMap := map[string]string{ + "path": "datapackage.json", + "hash": "sha256" + hashHex, + } + digest, err := json.MarshalIndent(digestMap, "", " ") + if err != nil { + return nil + } + + _, err = datapackageDigest.Write(digest) if err != nil { - log.Printf("Failed to write datapackage.json: %v", err) return err } return nil } + +// getResources returns a []byte with the json data for the datapackage.json file. 
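// Shape of the datapackage-digest.json payload built in createDatapackage
// above, sketched with hypothetical datapackage bytes: the digest is a
// sha256 over the exact bytes written to datapackage.json. The "sha256:"
// prefix here follows the convention of the calculateHash helper below.

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
)

func main() {
	datapackage := []byte(`{"profile":"data-package","wacz_version":"1.1.1"}`)
	sum := sha256.Sum256(datapackage)
	digest := map[string]string{
		"path": "datapackage.json",
		"hash": "sha256:" + hex.EncodeToString(sum[:]),
	}
	out, _ := json.MarshalIndent(digest, "", "  ")
	fmt.Println(string(out))
}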
+func (s *Archiver) getResources(archive *zip.ReadCloser) ([]byte, error) { + var resources []map[string]interface{} + for _, file := range archive.File { + hash, err := s.calculateHash(file) + if err != nil { + return []byte{}, err + } + + resources = append(resources, map[string]interface{}{ + "name": filepath.Base(file.Name), + "path": file.Name, + "hash": hash, + "bytes": file.FileInfo().Size(), + }) + } + + datapackage := map[string]interface{}{ + "profile": "data-package", + "wacz_version": "1.1.1", + "resources": resources, + } + + return json.MarshalIndent(datapackage, "", " ") +} + +// calculateHash returns the hash string of a zip.File. +func (s *Archiver) calculateHash(file *zip.File) (string, error) { + rc, err := file.Open() + if err != nil { + return "", err + } + defer rc.Close() + hash := sha256.New() + _, err = io.Copy(hash, rc) + if err != nil { + return "", err + } + + return "sha256:" + hex.EncodeToString(hash.Sum(nil)), nil +} diff --git a/internal/services/crawler.go b/internal/services/crawler.go index ccc3706..b4771db 100644 --- a/internal/services/crawler.go +++ b/internal/services/crawler.go @@ -90,7 +90,7 @@ func (s *CrawlerService) StartCrawler(p models.Project, b models.BasicAuth) erro var archiver *Archiver if p.Archive { - archiver, err = NewArchiver(p.Host) + archiver, err = NewArchiver(p) if err != nil { log.Printf("Failed to create archive: %v", err) } diff --git a/internal/services/project.go b/internal/services/project.go index d8bad4b..530f096 100644 --- a/internal/services/project.go +++ b/internal/services/project.go @@ -4,6 +4,7 @@ import ( "errors" "net/url" "os" + "strconv" "strings" "github.com/stjudewashere/seonaut/internal/models" @@ -99,7 +100,7 @@ func (s *ProjectService) DeleteAllUserProjects(user *models.User) { // ArchiveExists checks if a wacz file exists for the current project. // It returns true if it exists, otherwise it returns false. 
func (s *ProjectService) ArchiveExists(p *models.Project) bool { - _, err := os.Stat(ArchiveDir + p.Host + ".wacz") + _, err := os.Stat(ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz") return err == nil } @@ -110,7 +111,7 @@ func (s *ProjectService) DeleteArchive(p *models.Project) { return } - os.Remove(ArchiveDir + p.Host + ".wacz") + os.Remove(ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz") } // GetArchiveFilePath returns the project's wacz file path if it exists, @@ -120,5 +121,5 @@ func (s *ProjectService) GetArchiveFilePath(p *models.Project) (string, error) { return "", errors.New("WACZ archive file does not exist") } - return ArchiveDir + p.Host + ".wacz", nil + return ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz", nil } From 00b474da2da092442a4d0a3442fa49f7fd76afba Mon Sep 17 00:00:00 2001 From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:27:07 +0100 Subject: [PATCH 3/6] Update dependencies --- go.mod | 10 +++++----- go.sum | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index 44052fe..37afa66 100644 --- a/go.mod +++ b/go.mod @@ -16,9 +16,9 @@ require ( github.com/spf13/viper v1.19.0 github.com/temoto/robotstxt v1.1.2 github.com/turk/go-sitemap v0.0.0-20210912154218-82ad01095e30 - golang.org/x/crypto v0.28.0 - golang.org/x/net v0.30.0 - golang.org/x/text v0.19.0 + golang.org/x/crypto v0.29.0 + golang.org/x/net v0.31.0 + golang.org/x/text v0.20.0 gopkg.in/yaml.v3 v3.0.1 ) @@ -44,7 +44,7 @@ require ( github.com/subosito/gotenv v1.6.0 // indirect go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect - golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect - golang.org/x/sys v0.26.0 // indirect + golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect + golang.org/x/sys v0.27.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect ) diff --git a/go.sum b/go.sum index 7e26652..9a28941 100644 --- a/go.sum +++ b/go.sum @@ -137,17 +137,17 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= -golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= -golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY= -golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= +golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= +golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= +golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= +golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net 
v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= -golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo= +golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -156,8 +156,8 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= @@ -165,8 +165,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= -golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug= +golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= From b72543ea8ee344c0271ec0a261d728177c6feeaa Mon Sep 17 00:00:00 2001 From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:52:20 +0100 Subject: [PATCH 4/6] Read wacz file --- internal/archiver/archiver.go | 374 +++++++++++++++++++++++++++ internal/archiver/reader.go | 122 +++++++++ internal/routes/export.go | 4 +- internal/routes/resource.go | 8 +- internal/services/archiver.go | 363 +++----------------------- internal/services/container.go | 9 +- internal/services/crawler.go | 20 +- internal/services/crawler_handler.go | 6 +- internal/services/project.go | 48 +--- internal/services/project_test.go | 6 +- web/templates/resources.html | 1 + 11 files changed, 584 insertions(+), 377 deletions(-) create mode 100644 internal/archiver/archiver.go 
create mode 100644 internal/archiver/reader.go diff --git a/internal/archiver/archiver.go b/internal/archiver/archiver.go new file mode 100644 index 0000000..ce11852 --- /dev/null +++ b/internal/archiver/archiver.go @@ -0,0 +1,374 @@ +package archiver + +import ( + "archive/zip" + "bytes" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "net/url" + "os" + "path/filepath" + "slices" + "strings" + "time" + + "github.com/google/uuid" + "github.com/slyrz/warc" +) + +const ArchiveDir = "archive/" + +type Writer struct { + file *os.File + cdxjEntries []CDXJEntry + waczWriter *zip.Writer + warcWriter *warc.Writer + warcOffset int +} + +type CDXJEntry struct { + URL string `json:"url"` + Offset int `json:"offset"` + Status string `json:"status"` + Length int `json:"length"` + Mime string `json:"mime"` + Filename string `json:"filename"` + Digest string `json:"digest"` + RecordDigest string `json:"recordDigest"` + time time.Time `json:"-"` + parsedURL url.URL `json:"-"` +} + +type PageEntry struct { + URL string `json:"url"` + TS string `json:"ts"` +} + +// Returns a new Writer. +// It creates a new wacz file for the given url string. +func NewArchiver(waczPath string) (*Writer, error) { + waczDir := filepath.Dir(waczPath) + + err := os.MkdirAll(waczDir, 0755) + if err != nil { + return nil, err + } + + // Create the wacz file. + file, err := os.Create(waczPath) + if err != nil { + return nil, err + } + waczWriter := zip.NewWriter(file) + + fileInfo, err := file.Stat() + if err != nil { + return nil, fmt.Errorf("failed to get file info %w", err) + } + + header := &zip.FileHeader{ + Name: "data/data.warc", + Method: zip.Store, // Store mode, no compression + } + header.Modified = fileInfo.ModTime() // Optional: keep original modification time + // header.UncompressedSize64 = uint64(fileInfo.Size()) // Set uncompressed size + + // Create the warc writer. + archiveFile, err := waczWriter.CreateHeader(header) //waczWriter.Create("data/data.warc") + if err != nil { + log.Printf("failed to create WARC file entry in ZIP: %v", err) + return nil, err + } + warcWriter := warc.NewWriter(archiveFile) + + return &Writer{ + waczWriter: waczWriter, + file: file, + warcWriter: warcWriter, + }, nil +} + +// AddRecord adds a new response record to the warc file and keeps track +// of the added records to create the index once the Writer is closed. 
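// Why the zip.Store header above matters, in a standalone sketch: a stored
// (uncompressed) entry keeps the WARC bytes verbatim inside the zip, so a
// CDXJ (offset, length) pair can later be resolved with a single ReadAt at
// DataOffset()+offset — the approach the reader added in a later commit
// takes. The wacz path and read size below are hypothetical.

package main

import (
	"archive/zip"
	"fmt"
	"io"
	"log"
	"os"
)

func main() {
	const waczPath = "archive/1/example.com.wacz"

	r, err := zip.OpenReader(waczPath)
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	for _, f := range r.File {
		if f.Name != "data/data.warc" || f.Method != zip.Store {
			continue
		}
		base, err := f.DataOffset() // where the stored bytes start in the wacz
		if err != nil {
			log.Fatal(err)
		}
		wacz, err := os.Open(waczPath)
		if err != nil {
			log.Fatal(err)
		}
		defer wacz.Close()

		// offset and length would normally come from indexes/index.cdx.
		buf := make([]byte, 512)
		if _, err := wacz.ReadAt(buf, base); err != nil && err != io.EOF {
			log.Fatal(err)
		}
		fmt.Printf("%s\n", buf)
	}
}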
+func (s *Writer) AddRecord(response *http.Response) { + var bodyCopy bytes.Buffer + err := s.readResponseBody(&bodyCopy, response) + if err != nil { + log.Printf("failed to read response body %v", err) + return + } + + var contentBuffer bytes.Buffer + s.readResponseHeaders(&contentBuffer, response) + if _, err := io.Copy(&contentBuffer, &bodyCopy); err != nil { + fmt.Println("error reading response body copy:", err) + return + } + + recordLen := 0 + wdate := time.Now() + record := warc.NewRecord() + record.Header.Set("warc-type", "response") + record.Header.Set("warc-date", wdate.Format(time.RFC3339)) + record.Header.Set("warc-target-uri", response.Request.URL.String()) + record.Header.Set("content-type", response.Header.Get("Content-Type")) + record.Header.Set("warc-record-id", fmt.Sprintf("", uuid.New().String())) + record.Content = bytes.NewReader(contentBuffer.Bytes()) + if recordLen, err = s.warcWriter.WriteRecord(record); err != nil { + log.Printf("failed to write WARC record to archive: %v", err) + return + } + + cdxjEntry := CDXJEntry{ + Status: fmt.Sprintf("%d", response.StatusCode), + Length: recordLen, + Mime: response.Header.Get("Content-Type"), + Filename: "data/data.warc.gz", + Digest: fmt.Sprintf("sha-256:%x", sha256.Sum256(bodyCopy.Bytes())), + RecordDigest: fmt.Sprintf("sha256:%x", sha256.Sum256(contentBuffer.Bytes())), + Offset: s.warcOffset, + time: wdate, + parsedURL: *response.Request.URL, + URL: response.Request.URL.String(), + } + s.cdxjEntries = append(s.cdxjEntries, cdxjEntry) + + s.warcOffset += recordLen +} + +// readResponseBody Reads the http response's body into a bytes.Buffer. Then +// it resets the original response body so it can be used again later on. +func (s *Writer) readResponseBody(bodyCopy *bytes.Buffer, response *http.Response) error { + _, err := io.Copy(bodyCopy, response.Body) + if err != nil { + return err + } + response.Body.Close() // Close the original body + response.Body = io.NopCloser(bytes.NewReader(bodyCopy.Bytes())) + + return nil +} + +// readResponseHeaders reads the response's headers into a bytes.Buffer. +func (s *Writer) readResponseHeaders(contentBuffer *bytes.Buffer, response *http.Response) { + contentBuffer.WriteString( + fmt.Sprintf( + "HTTP/%d.%d %d %s\r\n", + response.ProtoMajor, + response.ProtoMinor, + response.StatusCode, + response.Status, + ), + ) + + for key, values := range response.Header { + for _, value := range values { + contentBuffer.WriteString(fmt.Sprintf("%s: %s\r\n", key, value)) + } + } + contentBuffer.WriteString("\r\n") +} + +// Close closes the archive and creates the remaining files. +func (s *Writer) Close() { + err := s.createIndex() + if err != nil { + log.Printf("failed to create index file entry in ZIP: %v", err) + } + + err = s.createPages() + if err != nil { + log.Printf("failed to create pages file entry in ZIP: %v", err) + } + + s.waczWriter.Close() + s.file.Close() + + err = s.createDatapackage() + if err != nil { + log.Printf("failed to create datapackage.json: %v", err) + return + } +} + +// Create the index file. 
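// The offset bookkeeping in AddRecord above, in miniature: each record's
// Offset is the running sum of the lengths WriteRecord reported for the
// records before it, so warc[offset : offset+length] spans exactly one
// record. The lengths used here are hypothetical.

package main

import "fmt"

func main() {
	lengths := []int{1024, 2048, 512} // as returned by warcWriter.WriteRecord
	offset := 0
	for i, n := range lengths {
		fmt.Printf("record %d: offset=%d length=%d\n", i, offset, n)
		offset += n
	}
	// record 0: offset=0 length=1024
	// record 1: offset=1024 length=2048
	// record 2: offset=3072 length=512
}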
+func (s *Writer) createIndex() error { + header := &zip.FileHeader{ + Name: "indexes/index.cdx", + Method: zip.Store, // Store mode, no compression + } + + indexWriter, err := s.waczWriter.CreateHeader(header) + if err != nil { + return err + } + + cdx := []string{} + for _, entry := range s.cdxjEntries { + domainParts := strings.Split(entry.parsedURL.Hostname(), ".") + slices.Reverse(domainParts) + searchableURL := strings.Join(domainParts, ",") + searchableURL = searchableURL + ")" + entry.parsedURL.RequestURI() + + jsonEntry, err := json.Marshal(entry) + if err != nil { + log.Printf("failed to json marshal index %v", err) + continue + } + + cdxjLine := fmt.Sprintf("%s %s %s\n", searchableURL, entry.time.Format("20060102150405"), jsonEntry) + cdx = append(cdx, cdxjLine) + } + + slices.Sort(cdx) + for _, e := range cdx { + indexWriter.Write([]byte(e)) + } + + return nil +} + +// addPage adds a new page record in the pages.jsonl file. +func (s *Writer) createPages() error { + // Add the pages.jsonl and add the file header + pagesWriter, err := s.waczWriter.Create("pages/pages.jsonl") + if err != nil { + return err + } + + header := `{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}` + header += "\n" + pagesWriter.Write([]byte(header)) + + for _, e := range s.cdxjEntries { + page := PageEntry{ + URL: e.parsedURL.String(), + TS: e.time.Format(time.RFC3339), + } + + jsonPage, err := json.Marshal(page) + if err != nil { + return err + } + + jsonPage = append(jsonPage, '\n') + _, err = pagesWriter.Write(jsonPage) + if err != nil { + log.Printf("error adding page %s %v", jsonPage, err) + } + } + + return nil +} + +// Create the datapackage.json file. +// Opens the zip file and reads all the files to create the resources json along with the hash. +// Then it saves the datapackage and creates the datapackage-digest json file. +func (s *Writer) createDatapackage() error { + archive, err := zip.OpenReader(s.file.Name()) + if err != nil { + log.Printf("Failed to open ZIP archive for reading: %v", err) + return err + } + defer archive.Close() + + datapackage, err := s.getResources(archive) + if err != nil { + log.Printf("failed to get zip resources %v", err) + } + + f, err := os.OpenFile(s.file.Name(), os.O_RDWR, 0644) + if err != nil { + return err + } + defer f.Close() + + zipWriter := zip.NewWriter(f) + defer zipWriter.Close() + + // Copy existing files. + for _, file := range archive.File { + zipWriter.Copy(file) + } + + // create datapackage. + datapackageFile, err := zipWriter.Create("datapackage.json") + if err != nil { + return err + } + _, err = datapackageFile.Write(datapackage) + if err != nil { + return err + } + + // create datapackage digest. + datapackageDigest, err := zipWriter.Create("datapackage-digest.json") + if err != nil { + return err + } + + hash := sha256.Sum256(datapackage) + hashHex := hex.EncodeToString(hash[:]) + digestMap := map[string]string{ + "path": "datapackage.json", + "hash": "sha256" + hashHex, + } + digest, err := json.MarshalIndent(digestMap, "", " ") + if err != nil { + return nil + } + + _, err = datapackageDigest.Write(digest) + if err != nil { + return err + } + + return nil +} + +// getResources returns a []byte with the json data for the datapackage.json file. 
+func (s *Writer) getResources(archive *zip.ReadCloser) ([]byte, error) { + var resources []map[string]interface{} + for _, file := range archive.File { + hash, err := s.calculateHash(file) + if err != nil { + return []byte{}, err + } + + resources = append(resources, map[string]interface{}{ + "name": filepath.Base(file.Name), + "path": file.Name, + "hash": hash, + "bytes": file.FileInfo().Size(), + }) + } + + datapackage := map[string]interface{}{ + "profile": "data-package", + "wacz_version": "1.1.1", + "resources": resources, + } + + return json.MarshalIndent(datapackage, "", " ") +} + +// calculateHash returns the hash string of a zip.File. +func (s *Writer) calculateHash(file *zip.File) (string, error) { + rc, err := file.Open() + if err != nil { + return "", err + } + defer rc.Close() + hash := sha256.New() + _, err = io.Copy(hash, rc) + if err != nil { + return "", err + } + + return "sha256:" + hex.EncodeToString(hash.Sum(nil)), nil +} diff --git a/internal/archiver/reader.go b/internal/archiver/reader.go new file mode 100644 index 0000000..69d7c0e --- /dev/null +++ b/internal/archiver/reader.go @@ -0,0 +1,122 @@ +package archiver + +import ( + "archive/zip" + "bufio" + "bytes" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "os" + "strings" + + "github.com/slyrz/warc" +) + +type Reader struct { + waczPath string +} + +func NewReader(waczPath string) *Reader { + return &Reader{ + waczPath: waczPath, + } +} + +func (s *Reader) ReadArchive(urlStr string) (content string) { + wacz, err := zip.OpenReader(s.waczPath) + if err != nil { + log.Printf("failed to open reader %v", err) + return "" + } + defer wacz.Close() + + record, err := s.getCDXEntry(wacz, urlStr) + if err != nil { + return "" + } + + zipoffset, err := s.getWarcOffset(wacz) + if err != nil { + return "" + } + + f, err := os.OpenFile(s.waczPath, os.O_RDWR, 0644) + if err != nil { + log.Println(err) + return + } + defer f.Close() + + buffer := make([]byte, record.Length) + + // Read a specific chunk from the file starting from 'offset' + _, err = f.ReadAt(buffer, zipoffset+int64(record.Offset)) + if err != nil && err.Error() != "EOF" { + log.Println(err) + } + wr, _ := warc.NewReader(bytes.NewReader(buffer)) + r, err := wr.ReadRecord() + if err != nil { + log.Println(err) + } + log.Println(string(buffer)) + + c, _ := io.ReadAll(r.Content) + return string(c) +} + +func (s *Reader) getCDXEntry(wacz *zip.ReadCloser, urlStr string) (*CDXJEntry, error) { + indexFile, err := wacz.Open("indexes/index.cdx") + if err != nil { + log.Printf("failed to open index file %v", err) + return nil, err + } + + var record CDXJEntry + scanner := bufio.NewScanner(indexFile) + for scanner.Scan() { + line := scanner.Text() + + if strings.Contains(line, urlStr) { + // Find the JSON part of the line by locating the first '{' character + jsonStart := strings.Index(line, "{") + if jsonStart == -1 { + fmt.Println("JSON data not found in line:", line) + continue + } + + // Extract the JSON substring + jsonData := line[jsonStart:] + + // Parse the JSON data into the Record struct + err := json.Unmarshal([]byte(jsonData), &record) + if err != nil { + fmt.Println("Error parsing JSON:", err) + continue + } + + return &record, nil + } + } + + return nil, errors.New("URL not found in index file") +} + +func (s *Reader) getWarcOffset(wacz *zip.ReadCloser) (int64, error) { + var zipOffset int64 + var err error + for _, file := range wacz.File { + if file.Name == "data/data.warc" { + zipOffset, err = file.DataOffset() + if err != nil { + return zipOffset, 
err + } + return zipOffset, nil + } + } + + return zipOffset, errors.New("warc file file not found") +} diff --git a/internal/routes/export.go b/internal/routes/export.go index 9149640..909d70d 100644 --- a/internal/routes/export.go +++ b/internal/routes/export.go @@ -45,7 +45,7 @@ func (h *exportHandler) indexHandler(w http.ResponseWriter, r *http.Request) { return } - archiveExists := h.Container.ProjectService.ArchiveExists(&pv.Project) + archiveExists := h.Container.ArchiveService.ArchiveExists(&pv.Project) h.Renderer.RenderTemplate(w, "export", &PageView{ User: *user, PageTitle: "EXPORT_VIEW", @@ -201,7 +201,7 @@ func (h *exportHandler) waczHandler(w http.ResponseWriter, r *http.Request) { return } - archiveFilePath, err := h.Container.ProjectService.GetArchiveFilePath(&p) + archiveFilePath, err := h.Container.ArchiveService.GetArchiveFilePath(&p) if err != nil { http.Redirect(w, r, "/", http.StatusSeeOther) return diff --git a/internal/routes/resource.go b/internal/routes/resource.go index b2d1924..8e8acd3 100644 --- a/internal/routes/resource.go +++ b/internal/routes/resource.go @@ -63,18 +63,24 @@ func (h *resourceHandler) indexHandler(w http.ResponseWriter, r *http.Request) { return } + pageReportView := h.ReportService.GetPageReport(rid, pv.Crawl.Id, tab, page) + + source := h.Container.ArchiveService.ReadArchive(&pv.Project, pageReportView.PageReport.URL) + data := &struct { PageReportView *models.PageReportView ProjectView *models.ProjectView Eid string Ep string Tab string + Source string }{ ProjectView: pv, Eid: eid, Ep: ep, Tab: tab, - PageReportView: h.ReportService.GetPageReport(rid, pv.Crawl.Id, tab, page), + PageReportView: pageReportView, + Source: source, } pageView := &PageView{ diff --git a/internal/services/archiver.go b/internal/services/archiver.go index a9d9809..332275e 100644 --- a/internal/services/archiver.go +++ b/internal/services/archiver.go @@ -1,356 +1,69 @@ package services import ( - "archive/zip" - "bytes" - "crypto/sha256" - "encoding/hex" - "encoding/json" - "fmt" - "io" - "log" - "net/http" - "net/url" + "errors" "os" - "path/filepath" - "slices" "strconv" - "strings" - "time" - "github.com/google/uuid" - "github.com/slyrz/warc" + "github.com/stjudewashere/seonaut/internal/archiver" "github.com/stjudewashere/seonaut/internal/models" ) -const ArchiveDir = "archive/" - -type Archiver struct { - file *os.File - cdxjEntries []CDXJEntry - waczWriter *zip.Writer - warcWriter *warc.Writer - warcOffset int -} - -type CDXJEntry struct { - Offset string `json:"offset"` - Status string `json:"status"` - Length string `json:"length"` - Mime string `json:"mime"` - Filename string `json:"filename"` - Digest string `json:"digest"` - RecordDigest string `json:"recordDigest"` - time time.Time `json:"-"` - url url.URL `json:"-"` -} - -type PageEntry struct { - URL string `json:"url"` - TS string `json:"ts"` +type ArchiveService struct { + ArchiveDir string } -// Returns a new Archiver. -// It creates a new wacz file for the given url string. -func NewArchiver(p models.Project) (*Archiver, error) { - // Create the project's archive directory if it doesn't exist. - projectPath := ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" - err := os.MkdirAll(projectPath, 0755) - if err != nil { - return nil, err +func NewArchiveService(ad string) *ArchiveService { + return &ArchiveService{ + ArchiveDir: ad, } - - // Create the wacz file. 
- file, err := os.Create(projectPath + p.Host + ".wacz") - if err != nil { - return nil, err - } - waczWriter := zip.NewWriter(file) - - // Create the warc writer. - archiveFile, err := waczWriter.Create("data/data.warc") - if err != nil { - log.Printf("failed to create WARC file entry in ZIP: %v", err) - return nil, err - } - warcWriter := warc.NewWriter(archiveFile) - - return &Archiver{ - waczWriter: waczWriter, - file: file, - warcWriter: warcWriter, - }, nil } -// AddRecord adds a new response record to the warc file and keeps track -// of the added records to create the index once the archiver is closed. -func (s *Archiver) AddRecord(response *http.Response) { - var bodyCopy bytes.Buffer - err := s.readResponseBody(&bodyCopy, response) - if err != nil { - log.Printf("failed to read response body %v", err) - return - } - - var contentBuffer bytes.Buffer - s.readResponseHeaders(&contentBuffer, response) - if _, err := io.Copy(&contentBuffer, &bodyCopy); err != nil { - fmt.Println("error reading response body copy:", err) - return - } - - wdate := time.Now() - record := warc.NewRecord() - record.Header.Set("warc-type", "response") - record.Header.Set("warc-date", wdate.Format(time.RFC3339)) - record.Header.Set("warc-target-uri", response.Request.URL.String()) - record.Header.Set("content-type", response.Header.Get("Content-Type")) - record.Header.Set("warc-record-id", fmt.Sprintf("", uuid.New().String())) - record.Content = bytes.NewReader(contentBuffer.Bytes()) - if _, err := s.warcWriter.WriteRecord(record); err != nil { - log.Printf("failed to write WARC record to archive: %v", err) - return - } - - cdxjEntry := CDXJEntry{ - Status: fmt.Sprintf("%d", response.StatusCode), - Length: fmt.Sprintf("%d", len(contentBuffer.Bytes())), - Mime: response.Header.Get("Content-Type"), - Filename: "data/data.warc.gz", - Digest: fmt.Sprintf("sha-256:%x", sha256.Sum256(bodyCopy.Bytes())), - RecordDigest: fmt.Sprintf("sha256:%x", sha256.Sum256(contentBuffer.Bytes())), - Offset: fmt.Sprintf("%d", s.warcOffset), - time: wdate, - url: *response.Request.URL, - } - s.cdxjEntries = append(s.cdxjEntries, cdxjEntry) - - s.warcOffset += contentBuffer.Len() +// ArchiveProject returns an archiver for the specified project. +// It returns an error if the archiver couldn't be created. +func (s *ArchiveService) GetArchiveWriter(p *models.Project) (*archiver.Writer, error) { + return archiver.NewArchiver(s.getArchiveFile(p)) } -// readResponseBody Reads the http response's body into a bytes.Buffer. Then -// it resets the original response body so it can be used again later on. -func (s *Archiver) readResponseBody(bodyCopy *bytes.Buffer, response *http.Response) error { - _, err := io.Copy(bodyCopy, response.Body) - if err != nil { - return err - } - response.Body.Close() // Close the original body - response.Body = io.NopCloser(bytes.NewReader(bodyCopy.Bytes())) +// ReadArchive reads an URLs WACZ record from a project's archive. +func (s *ArchiveService) ReadArchive(p *models.Project, urlStr string) string { + waczPath := s.getArchiveFile(p) + reader := archiver.NewReader(waczPath) - return nil + return reader.ReadArchive(urlStr) } -// readResponseHeaders reads the response's headers into a bytes.Buffer. 
-func (s *Archiver) readResponseHeaders(contentBuffer *bytes.Buffer, response *http.Response) { - contentBuffer.WriteString( - fmt.Sprintf( - "HTTP/%d.%d %d %s\r\n", - response.ProtoMajor, - response.ProtoMinor, - response.StatusCode, - response.Status, - ), - ) - - for key, values := range response.Header { - for _, value := range values { - contentBuffer.WriteString(fmt.Sprintf("%s: %s\r\n", key, value)) - } - } - contentBuffer.WriteString("\r\n") +// ArchiveExists checks if a wacz file exists for the current project. +// It returns true if it exists, otherwise it returns false. +func (s *ArchiveService) ArchiveExists(p *models.Project) bool { + file := s.getArchiveFile(p) + _, err := os.Stat(file) + return err == nil } -// Close closes the archive and creates the remaining files. -func (s *Archiver) Close() { - err := s.createIndex() - if err != nil { - log.Printf("failed to create index file entry in ZIP: %v", err) - } - - err = s.createPages() - if err != nil { - log.Printf("failed to create pages file entry in ZIP: %v", err) - } - - s.waczWriter.Close() - s.file.Close() - - err = s.createDatapackage() - if err != nil { - log.Printf("failed to create datapackage.json: %v", err) +// DeleteArchive removes the wacz archive file for a given project. +// It checks if the file exists before removing it. +func (s *ArchiveService) DeleteArchive(p *models.Project) { + if !s.ArchiveExists(p) { return } -} - -// Create the index file. -func (s *Archiver) createIndex() error { - indexWriter, err := s.waczWriter.Create("indexes/index.cdx") - if err != nil { - return err - } - - cdx := []string{} - for _, entry := range s.cdxjEntries { - domainParts := strings.Split(entry.url.Hostname(), ".") - slices.Reverse(domainParts) - searchableURL := strings.Join(domainParts, ",") - searchableURL = searchableURL + ")" + entry.url.RequestURI() - - jsonEntry, err := json.Marshal(entry) - if err != nil { - log.Printf("failed to json marshal index %v", err) - continue - } - - cdxjLine := fmt.Sprintf("%s %s %s\n", searchableURL, entry.time.Format("20060102150405"), jsonEntry) - cdx = append(cdx, cdxjLine) - } - - slices.Sort(cdx) - for _, e := range cdx { - indexWriter.Write([]byte(e)) - } - return nil + file := s.getArchiveFile(p) + os.Remove(file) } -// addPage adds a new page record in the pages.jsonl file. -func (s *Archiver) createPages() error { - // Add the pages.jsonl and add the file header - pagesWriter, err := s.waczWriter.Create("pages/pages.jsonl") - if err != nil { - return err - } - - header := `{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}` - header += "\n" - pagesWriter.Write([]byte(header)) - - for _, e := range s.cdxjEntries { - page := PageEntry{ - URL: e.url.String(), - TS: e.time.Format(time.RFC3339), - } - - jsonPage, err := json.Marshal(page) - if err != nil { - return err - } - - jsonPage = append(jsonPage, '\n') - _, err = pagesWriter.Write(jsonPage) - if err != nil { - log.Printf("error adding page %s %v", jsonPage, err) - } +// GetArchiveFilePath returns the project's wacz file path if it exists, +// otherwise it returns an error. +func (s *ArchiveService) GetArchiveFilePath(p *models.Project) (string, error) { + if !s.ArchiveExists(p) { + return "", errors.New("WACZ archive file does not exist") } - return nil + file := s.getArchiveFile(p) + return file, nil } -// Create the datapackage.json file. -// Opens the zip file and reads all the files to create the resources json along with the hash. 
-// Then it saves the datapackage and creates the datapackage-digest json file. -func (s *Archiver) createDatapackage() error { - archive, err := zip.OpenReader(s.file.Name()) - if err != nil { - log.Printf("Failed to open ZIP archive for reading: %v", err) - return err - } - defer archive.Close() - - datapackage, err := s.getResources(archive) - if err != nil { - log.Printf("failed to get zip resources %v", err) - } - - f, err := os.OpenFile(s.file.Name(), os.O_RDWR, 0644) - if err != nil { - return err - } - defer f.Close() - - zipWriter := zip.NewWriter(f) - defer zipWriter.Close() - - // Copy existing files. - for _, file := range archive.File { - zipWriter.Copy(file) - } - - // create datapackage. - datapackageFile, err := zipWriter.Create("datapackage.json") - if err != nil { - return err - } - _, err = datapackageFile.Write(datapackage) - if err != nil { - return err - } - - // create datapackage digest. - datapackageDigest, err := zipWriter.Create("datapackage-digest.json") - if err != nil { - return err - } - - hash := sha256.Sum256(datapackage) - hashHex := hex.EncodeToString(hash[:]) - digestMap := map[string]string{ - "path": "datapackage.json", - "hash": "sha256" + hashHex, - } - digest, err := json.MarshalIndent(digestMap, "", " ") - if err != nil { - return nil - } - - _, err = datapackageDigest.Write(digest) - if err != nil { - return err - } - - return nil -} - -// getResources returns a []byte with the json data for the datapackage.json file. -func (s *Archiver) getResources(archive *zip.ReadCloser) ([]byte, error) { - var resources []map[string]interface{} - for _, file := range archive.File { - hash, err := s.calculateHash(file) - if err != nil { - return []byte{}, err - } - - resources = append(resources, map[string]interface{}{ - "name": filepath.Base(file.Name), - "path": file.Name, - "hash": hash, - "bytes": file.FileInfo().Size(), - }) - } - - datapackage := map[string]interface{}{ - "profile": "data-package", - "wacz_version": "1.1.1", - "resources": resources, - } - - return json.MarshalIndent(datapackage, "", " ") -} - -// calculateHash returns the hash string of a zip.File. -func (s *Archiver) calculateHash(file *zip.File) (string, error) { - rc, err := file.Open() - if err != nil { - return "", err - } - defer rc.Close() - hash := sha256.New() - _, err = io.Copy(hash, rc) - if err != nil { - return "", err - } - - return "sha256:" + hex.EncodeToString(hash.Sum(nil)), nil +// getArchiveFile returns a string with the path to the project's WACZ file. +func (s *ArchiveService) getArchiveFile(p *models.Project) string { + return s.ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz" } diff --git a/internal/services/container.go b/internal/services/container.go index 1a58085..6d3b0ae 100644 --- a/internal/services/container.go +++ b/internal/services/container.go @@ -29,6 +29,7 @@ type Container struct { CrawlerService *CrawlerService Renderer *Renderer CookieSession *CookieSession + ArchiveService *ArchiveService db *sql.DB issueRepository *repository.IssueRepository @@ -44,6 +45,7 @@ func NewContainer(configFile string) *Container { c := &Container{} c.InitConfig(configFile) c.InitDB() + c.InitArchiveService() c.InitRepositories() c.InitPubSubBroker() c.InitIssueService() @@ -159,7 +161,7 @@ func (c *Container) InitProjectService() { c.crawlRepository, } - c.ProjectService = NewProjectService(repository) + c.ProjectService = NewProjectService(repository, c.ArchiveService) // UserService DeleteHooks are called when a user is deleted. 
// Add a DeleteHook so it deletes all user projects and crawl @@ -191,6 +193,7 @@ func (c *Container) InitCrawlerService() { Broker: c.PubSubBroker, ReportManager: c.ReportManager, CrawlerHandler: NewCrawlerHandler(c.pageReportRepository, c.PubSubBroker, c.ReportManager), + ArchiveService: c.ArchiveService, Config: c.Config.Crawler, } repository := &struct { @@ -226,3 +229,7 @@ func (c *Container) InitRenderer() { func (c *Container) InitCookieSession() { c.CookieSession = NewCookieSession(c.userRepository) } + +func (c *Container) InitArchiveService() { + c.ArchiveService = NewArchiveService("archive") +} diff --git a/internal/services/crawler.go b/internal/services/crawler.go index b4771db..40fcc7f 100644 --- a/internal/services/crawler.go +++ b/internal/services/crawler.go @@ -35,6 +35,7 @@ type CrawlerServicesContainer struct { Broker *Broker ReportManager *ReportManager CrawlerHandler *CrawlerHandler + ArchiveService *ArchiveService Config *config.CrawlerConfig } @@ -44,6 +45,7 @@ type CrawlerService struct { broker *Broker reportManager *ReportManager crawlerHandler *CrawlerHandler + ArchiveService *ArchiveService crawlers map[int64]*crawler.Crawler lock *sync.RWMutex } @@ -55,6 +57,7 @@ func NewCrawlerService(r CrawlerServiceRepository, s CrawlerServicesContainer) * config: s.Config, reportManager: s.ReportManager, crawlerHandler: s.CrawlerHandler, + ArchiveService: s.ArchiveService, crawlers: make(map[int64]*crawler.Crawler), lock: &sync.RWMutex{}, } @@ -86,23 +89,20 @@ func (s *CrawlerService) StartCrawler(p models.Project, b models.BasicAuth) erro } go func() { + defer s.removeCrawler(&p) defer s.repository.DeleteCrawlData(&previousCrawl) - var archiver *Archiver if p.Archive { - archiver, err = NewArchiver(p) + archiver, err := s.ArchiveService.GetArchiveWriter(&p) if err != nil { log.Printf("Failed to create archive: %v", err) + c.OnResponse(s.crawlerHandler.responseCallback(crawl, &p, c)) + } else { + defer archiver.Close() + c.OnResponse(s.crawlerHandler.archiveCallback(crawl, &p, c, archiver)) } } - if archiver != nil { - defer archiver.Close() - c.OnResponse(s.crawlerHandler.archiveCallback(crawl, &p, c, archiver)) - } else { - c.OnResponse(s.crawlerHandler.responseCallback(crawl, &p, c)) - } - log.Printf("Crawling %s...", p.URL) c.AddRequest(&crawler.RequestMessage{URL: u, Data: crawlerData{}}) @@ -127,8 +127,6 @@ func (s *CrawlerService) StartCrawler(p models.Project, b models.BasicAuth) erro s.repository.UpdateCrawl(crawl) s.broker.Publish(fmt.Sprintf("crawl-%d", p.Id), &models.Message{Name: "CrawlEnd", Data: crawl.TotalURLs}) log.Printf("Crawled %d urls in %s", crawl.TotalURLs, p.URL) - - s.removeCrawler(&p) }() return nil diff --git a/internal/services/crawler_handler.go b/internal/services/crawler_handler.go index 8ed8750..6d3b0db 100644 --- a/internal/services/crawler_handler.go +++ b/internal/services/crawler_handler.go @@ -29,6 +29,10 @@ type crawlerData struct { Depth int } +type Archiver interface { + AddRecord(*http.Response) +} + func NewCrawlerHandler(r CrawlerHandlerRepository, b *Broker, m *ReportManager) *CrawlerHandler { return &CrawlerHandler{ repository: r, @@ -38,7 +42,7 @@ func NewCrawlerHandler(r CrawlerHandlerRepository, b *Broker, m *ReportManager) } } -func (s *CrawlerHandler) archiveCallback(crawl *models.Crawl, p *models.Project, c *crawler.Crawler, a *Archiver) crawler.ResponseCallback { +func (s *CrawlerHandler) archiveCallback(crawl *models.Crawl, p *models.Project, c *crawler.Crawler, a Archiver) crawler.ResponseCallback { responseCallback := 
s.responseCallback(crawl, p, c) return func(r *crawler.ResponseMessage) { if r.Error == nil && a != nil { diff --git a/internal/services/project.go b/internal/services/project.go index 530f096..5f5bb53 100644 --- a/internal/services/project.go +++ b/internal/services/project.go @@ -3,14 +3,15 @@ package services import ( "errors" "net/url" - "os" - "strconv" "strings" "github.com/stjudewashere/seonaut/internal/models" ) type ( + ArchiveRemover interface { + DeleteArchive(*models.Project) + } ProjectServiceRepository interface { SaveProject(*models.Project, int) DeleteProject(*models.Project) @@ -23,12 +24,16 @@ type ( } ProjectService struct { - repository ProjectServiceRepository + repository ProjectServiceRepository + archiveRemover ArchiveRemover } ) -func NewProjectService(r ProjectServiceRepository) *ProjectService { - return &ProjectService{repository: r} +func NewProjectService(r ProjectServiceRepository, a ArchiveRemover) *ProjectService { + return &ProjectService{ + repository: r, + archiveRemover: a, + } } // SaveProject stores a new project. @@ -74,14 +79,14 @@ func (s *ProjectService) DeleteProject(p *models.Project) { go func() { s.repository.DeleteProjectCrawls(p) s.repository.DeleteProject(p) - s.DeleteArchive(p) + s.archiveRemover.DeleteArchive(p) }() } // Update project details. func (s *ProjectService) UpdateProject(p *models.Project) error { if !p.Archive { - s.DeleteArchive(p) + s.archiveRemover.DeleteArchive(p) } return s.repository.UpdateProject(p) @@ -93,33 +98,6 @@ func (s *ProjectService) DeleteAllUserProjects(user *models.User) { for _, p := range projects { s.repository.DeleteProjectCrawls(&p) s.repository.DeleteProject(&p) - s.DeleteArchive(&p) - } -} - -// ArchiveExists checks if a wacz file exists for the current project. -// It returns true if it exists, otherwise it returns false. -func (s *ProjectService) ArchiveExists(p *models.Project) bool { - _, err := os.Stat(ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz") - return err == nil -} - -// DeleteArchive removes the wacz archive file for a given project. -// It checks if the file exists before removing it. -func (s *ProjectService) DeleteArchive(p *models.Project) { - if !s.ArchiveExists(p) { - return - } - - os.Remove(ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz") -} - -// GetArchiveFilePath returns the project's wacz file path if it exists, -// otherwise it returns an error. 
-func (s *ProjectService) GetArchiveFilePath(p *models.Project) (string, error) { - if !s.ArchiveExists(p) { - return "", errors.New("WACZ archive file does not exist") + s.archiveRemover.DeleteArchive(&p) } - - return ArchiveDir + "/" + strconv.FormatInt(p.Id, 10) + "/" + p.Host + ".wacz", nil } diff --git a/internal/services/project_test.go b/internal/services/project_test.go index 83a46cf..ce31b04 100644 --- a/internal/services/project_test.go +++ b/internal/services/project_test.go @@ -38,7 +38,11 @@ func (s *projectTestRepository) FindProjectById(id, uid int) (models.Project, er } func (s *projectTestRepository) DeleteProjectCrawls(*models.Project) {} -var service = services.NewProjectService(&projectTestRepository{}) +type ArchiveDeleter struct{} + +func (ad *ArchiveDeleter) DeleteArchive(p *models.Project) {} + +var service = services.NewProjectService(&projectTestRepository{}, &ArchiveDeleter{}) func TestFindProjectById(t *testing.T) { p, err := service.FindProject(gid, guid) diff --git a/web/templates/resources.html b/web/templates/resources.html index 186c2a9..ec0b765 100644 --- a/web/templates/resources.html +++ b/web/templates/resources.html @@ -115,6 +115,7 @@ + {{ $cid := .ProjectView.Crawl.Id }} {{ $pid := .ProjectView.Project.Id }} {{ $crawlSitemap := .ProjectView.Project.CrawlSitemap }} From 3cae11867e220d2f0d559286ba231f700634e75b Mon Sep 17 00:00:00 2001 From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:58:50 +0100 Subject: [PATCH 5/6] Allow to view source code from archived response --- internal/archiver/reader.go | 121 +++++++++++++------ internal/archiver/{archiver.go => writer.go} | 22 ++-- internal/routes/app.go | 1 + internal/routes/resource.go | 99 ++++++++++++++- internal/services/crawler.go | 7 +- translations/translation.en.yaml | 1 + web/css/archive.css | 14 +++ web/css/style.css | 3 +- web/templates/archive.html | 66 ++++++++++ web/templates/resources.html | 4 +- 10 files changed, 280 insertions(+), 58 deletions(-) rename internal/archiver/{archiver.go => writer.go} (96%) create mode 100644 web/css/archive.css create mode 100644 web/templates/archive.html diff --git a/internal/archiver/reader.go b/internal/archiver/reader.go index 69d7c0e..7e7cb68 100644 --- a/internal/archiver/reader.go +++ b/internal/archiver/reader.go @@ -2,14 +2,16 @@ package archiver import ( "archive/zip" - "bufio" "bytes" "encoding/json" "errors" "fmt" "io" "log" + "net/url" "os" + "slices" + "sort" "strings" "github.com/slyrz/warc" @@ -38,7 +40,12 @@ func (s *Reader) ReadArchive(urlStr string) (content string) { return "" } - zipoffset, err := s.getWarcOffset(wacz) + file, err := s.getZipFile(wacz, "data/data.warc") + if err != nil { + return "" + } + + zipoffset, err := file.DataOffset() if err != nil { return "" } @@ -52,7 +59,6 @@ func (s *Reader) ReadArchive(urlStr string) (content string) { buffer := make([]byte, record.Length) - // Read a specific chunk from the file starting from 'offset' _, err = f.ReadAt(buffer, zipoffset+int64(record.Offset)) if err != nil && err.Error() != "EOF" { log.Println(err) @@ -62,61 +68,98 @@ func (s *Reader) ReadArchive(urlStr string) (content string) { if err != nil { log.Println(err) } - log.Println(string(buffer)) c, _ := io.ReadAll(r.Content) return string(c) } -func (s *Reader) getCDXEntry(wacz *zip.ReadCloser, urlStr string) (*CDXJEntry, error) { - indexFile, err := wacz.Open("indexes/index.cdx") +func (s *Reader) getCDXEntry(wacz *zip.ReadCloser, urlStr string) (*IndexEntry, error) { + file, err := 
s.getZipFile(wacz, "indexes/index.cdx")
 	if err != nil {
-		log.Printf("failed to open index file %v", err)
 		return nil, err
 	}
+	offset, err := file.DataOffset()
+	if err != nil {
+		return nil, err
+	}
+	size := file.FileInfo().Size()
 
-	var record CDXJEntry
-	scanner := bufio.NewScanner(indexFile)
-	for scanner.Scan() {
-		line := scanner.Text()
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		return nil, err
+	}
 
-		if strings.Contains(line, urlStr) {
-			// Find the JSON part of the line by locating the first '{' character
-			jsonStart := strings.Index(line, "{")
-			if jsonStart == -1 {
-				fmt.Println("JSON data not found in line:", line)
-				continue
-			}
+	domainParts := strings.Split(parsedURL.Hostname(), ".")
+	slices.Reverse(domainParts)
+	searchableURL := strings.Join(domainParts, ",")
+	searchableURL = searchableURL + ")" + parsedURL.RequestURI()
 
-			// Extract the JSON substring
-			jsonData := line[jsonStart:]
+	line, err := s.searchFileSegment(offset, size, searchableURL)
+	if err != nil {
+		return nil, err
+	}
 
-			// Parse the JSON data into the Record struct
-			err := json.Unmarshal([]byte(jsonData), &record)
-			if err != nil {
-				fmt.Println("Error parsing JSON:", err)
-				continue
-			}
+	var record IndexEntry
 
-			return &record, nil
-		}
+	jsonStart := strings.Index(line, "{")
+	if jsonStart == -1 {
+		fmt.Println("JSON data not found in line:", line)
+		return nil, fmt.Errorf("invalid IndexEntry %s", line)
 	}
 
-	return nil, errors.New("URL not found in index file")
+	// Extract the JSON substring
+	jsonData := line[jsonStart:]
+
+	err = json.Unmarshal([]byte(jsonData), &record)
+	if err != nil {
+		return nil, fmt.Errorf("error parsing JSON: %w", err)
+	}
+
+	return &record, nil
 }
 
-func (s *Reader) getWarcOffset(wacz *zip.ReadCloser) (int64, error) {
-	var zipOffset int64
-	var err error
+func (s *Reader) getZipFile(wacz *zip.ReadCloser, waczFile string) (*zip.File, error) {
 	for _, file := range wacz.File {
-		if file.Name == "data/data.warc" {
-			zipOffset, err = file.DataOffset()
-			if err != nil {
-				return zipOffset, err
-			}
-			return zipOffset, nil
+		if file.Name == waczFile {
+			return file, nil
 		}
 	}
 
-	return zipOffset, errors.New("warc file file not found")
+	return nil, errors.New("file not found in wacz archive")
 }
+
+func (s *Reader) searchFileSegment(offset, length int64, target string) (string, error) {
+	file, err := os.Open(s.waczPath)
+	if err != nil {
+		return "", fmt.Errorf("failed to open file: %v", err)
+	}
+	defer file.Close()
+
+	// Seek to the specified offset
+	_, err = file.Seek(offset, 0)
+	if err != nil {
+		return "", fmt.Errorf("failed to seek to offset: %v", err)
+	}
+
+	// Read exactly 'length' bytes; io.ReadFull guards against short reads
+	buffer := make([]byte, length)
+	_, err = io.ReadFull(file, buffer)
+	if err != nil {
+		return "", fmt.Errorf("failed to read segment: %v", err)
+	}
+
+	// Split the buffer into lines
+	lines := strings.Split(string(buffer), "\n")
+
+	// Perform binary search on the sorted index lines using sort.Search
+	index := sort.Search(len(lines), func(i int) bool {
+		return lines[i] >= target
+	})
+
+	// Check if the found line starts with the target prefix
+	if index < len(lines) && strings.HasPrefix(lines[index], target) {
+		return lines[index], nil // Found the line
+	}
+
+	return "", fmt.Errorf("no line starting with '%s' found", target)
+}
diff --git a/internal/archiver/archiver.go b/internal/archiver/writer.go
similarity index 96%
rename from internal/archiver/archiver.go
rename to internal/archiver/writer.go
index ce11852..6c65dce 100644
--- a/internal/archiver/archiver.go
+++
b/internal/archiver/writer.go @@ -21,17 +21,15 @@ import ( "github.com/slyrz/warc" ) -const ArchiveDir = "archive/" - type Writer struct { - file *os.File - cdxjEntries []CDXJEntry - waczWriter *zip.Writer - warcWriter *warc.Writer - warcOffset int + file *os.File + indexEntries []IndexEntry + waczWriter *zip.Writer + warcWriter *warc.Writer + warcOffset int } -type CDXJEntry struct { +type IndexEntry struct { URL string `json:"url"` Offset int `json:"offset"` Status string `json:"status"` @@ -124,7 +122,7 @@ func (s *Writer) AddRecord(response *http.Response) { return } - cdxjEntry := CDXJEntry{ + indexEntry := IndexEntry{ Status: fmt.Sprintf("%d", response.StatusCode), Length: recordLen, Mime: response.Header.Get("Content-Type"), @@ -136,7 +134,7 @@ func (s *Writer) AddRecord(response *http.Response) { parsedURL: *response.Request.URL, URL: response.Request.URL.String(), } - s.cdxjEntries = append(s.cdxjEntries, cdxjEntry) + s.indexEntries = append(s.indexEntries, indexEntry) s.warcOffset += recordLen } @@ -209,7 +207,7 @@ func (s *Writer) createIndex() error { } cdx := []string{} - for _, entry := range s.cdxjEntries { + for _, entry := range s.indexEntries { domainParts := strings.Split(entry.parsedURL.Hostname(), ".") slices.Reverse(domainParts) searchableURL := strings.Join(domainParts, ",") @@ -245,7 +243,7 @@ func (s *Writer) createPages() error { header += "\n" pagesWriter.Write([]byte(header)) - for _, e := range s.cdxjEntries { + for _, e := range s.indexEntries { page := PageEntry{ URL: e.parsedURL.String(), TS: e.time.Format(time.RFC3339), diff --git a/internal/routes/app.go b/internal/routes/app.go index e0f0617..d3f9724 100644 --- a/internal/routes/app.go +++ b/internal/routes/app.go @@ -67,6 +67,7 @@ func NewServer(container *services.Container) { // Resource route resourceHandler := resourceHandler{container} http.HandleFunc("GET /resources", container.CookieSession.Auth(resourceHandler.indexHandler)) + http.HandleFunc("GET /archive", container.CookieSession.Auth(resourceHandler.archiveHandler)) // User routes userHandler := userHandler{container} diff --git a/internal/routes/resource.go b/internal/routes/resource.go index 8e8acd3..7b06457 100644 --- a/internal/routes/resource.go +++ b/internal/routes/resource.go @@ -64,8 +64,8 @@ func (h *resourceHandler) indexHandler(w http.ResponseWriter, r *http.Request) { } pageReportView := h.ReportService.GetPageReport(rid, pv.Crawl.Id, tab, page) - - source := h.Container.ArchiveService.ReadArchive(&pv.Project, pageReportView.PageReport.URL) + isArchived := h.Container.ArchiveService.ArchiveExists(&pv.Project) + isTextMedia := strings.HasPrefix(pageReportView.PageReport.MediaType, "text/") data := &struct { PageReportView *models.PageReportView @@ -73,14 +73,14 @@ func (h *resourceHandler) indexHandler(w http.ResponseWriter, r *http.Request) { Eid string Ep string Tab string - Source string + Archived bool }{ ProjectView: pv, Eid: eid, Ep: ep, Tab: tab, PageReportView: pageReportView, - Source: source, + Archived: isArchived && isTextMedia, } pageView := &PageView{ @@ -91,3 +91,94 @@ func (h *resourceHandler) indexHandler(w http.ResponseWriter, r *http.Request) { h.Renderer.RenderTemplate(w, "resources", pageView) } + +func (h *resourceHandler) archiveHandler(w http.ResponseWriter, r *http.Request) { + user, ok := h.CookieSession.GetUser(r.Context()) + if !ok { + http.Redirect(w, r, "/signout", http.StatusSeeOther) + return + } + + pid, err := strconv.Atoi(r.URL.Query().Get("pid")) + if err != nil { + http.Redirect(w, r, "/", 
http.StatusSeeOther)
+		return
+	}
+
+	rid, err := strconv.Atoi(r.URL.Query().Get("rid"))
+	if err != nil {
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+		return
+	}
+
+	eid := r.URL.Query().Get("eid")
+	ep := r.URL.Query().Get("ep")
+	if eid == "" && ep == "" {
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+		return
+	}
+
+	tab := r.URL.Query().Get("t")
+	if tab == "" {
+		tab = "details"
+	}
+
+	pv, err := h.ProjectViewService.GetProjectView(pid, user.Id)
+	if err != nil {
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+		return
+	}
+
+	isArchived := h.Container.ArchiveService.ArchiveExists(&pv.Project)
+	if !isArchived {
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+		return
+	}
+
+	pageReportView := h.ReportService.GetPageReport(rid, pv.Crawl.Id, "default", 1)
+	isTextMedia := strings.HasPrefix(pageReportView.PageReport.MediaType, "text/")
+	if !isTextMedia {
+		http.Redirect(w, r, "/", http.StatusSeeOther)
+		return
+	}
+
+	archive := h.Container.ArchiveService.ReadArchive(&pv.Project, pageReportView.PageReport.URL)
+
+	var headers, body string
+	index := strings.Index(archive, "\r\n\r\n")
+	if index != -1 {
+		// Split at the blank line (\r\n\r\n) separating headers from body;
+		// TrimSpace drops the leftover separator bytes.
+		headers = archive[:index]
+		body = strings.TrimSpace(archive[index+1:])
+	} else {
+		// No blank line found: treat the whole record as headers.
+		headers = archive
+		body = ""
+	}
+
+	data := &struct {
+		PageReportView *models.PageReportView
+		ProjectView    *models.ProjectView
+		Eid            string
+		Ep             string
+		Tab            string
+		Headers        string
+		Body           string
+	}{
+		ProjectView:    pv,
+		PageReportView: pageReportView,
+		Eid:            eid,
+		Ep:             ep,
+		Tab:            tab,
+		Headers:        headers,
+		Body:           body,
+	}
+
+	pageView := &PageView{
+		Data:      data,
+		User:      *user,
+		PageTitle: "ARCHIVE_VIEW",
+	}
+
+	h.Renderer.RenderTemplate(w, "archive", pageView)
+}
diff --git a/internal/services/crawler.go b/internal/services/crawler.go
index 40fcc7f..b61c299 100644
--- a/internal/services/crawler.go
+++ b/internal/services/crawler.go
@@ -92,17 +92,22 @@ func (s *CrawlerService) StartCrawler(p models.Project, b models.BasicAuth) erro
 		defer s.removeCrawler(&p)
 		defer s.repository.DeleteCrawlData(&previousCrawl)
 
+		archiveEnabled := false
 		if p.Archive {
 			archiver, err := s.ArchiveService.GetArchiveWriter(&p)
 			if err != nil {
 				log.Printf("Failed to create archive: %v", err)
-				c.OnResponse(s.crawlerHandler.responseCallback(crawl, &p, c))
 			} else {
 				defer archiver.Close()
 				c.OnResponse(s.crawlerHandler.archiveCallback(crawl, &p, c, archiver))
+				archiveEnabled = true
 			}
 		}
 
+		if !archiveEnabled {
+			c.OnResponse(s.crawlerHandler.responseCallback(crawl, &p, c))
+		}
+
 		log.Printf("Crawling %s...", p.URL)
 		c.AddRequest(&crawler.RequestMessage{URL: u, Data: crawlerData{}})
 
diff --git a/translations/translation.en.yaml b/translations/translation.en.yaml
index 13c7867..698054d 100644
--- a/translations/translation.en.yaml
+++ b/translations/translation.en.yaml
@@ -23,6 +23,7 @@ EXPORT_VIEW: Export
 CRAWL_AUTH_VIEW: Project HTTP Basic Authentication
 EXPLORER: URL Explorer
 DELETE_ACCOUNT_VIEW: Delete Account
+ARCHIVE_VIEW: Archive Source Code
 ERROR_50x: Status 50x
 ERROR_50x_DESC: This kind of error usually occurs due to a server bug or misconfiguration; the affected pages don't load properly and show an error page instead, scaring your users and annoying search engines.
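Note: the header/body split in archiveHandler above works because the WARC response record stores the raw HTTP message, in which the header block and the body are separated by a blank line ("\r\n\r\n"). A minimal standalone sketch of the same idea, using a made-up response string (not part of the patch):

package main

import (
	"fmt"
	"strings"
)

// splitResponse mirrors the handler's logic: everything before the first
// blank line is the header block; everything after it is the body.
func splitResponse(raw string) (headers, body string) {
	if i := strings.Index(raw, "\r\n\r\n"); i != -1 {
		// Skip the four separator bytes; TrimSpace drops any leftover CRLF.
		return raw[:i], strings.TrimSpace(raw[i+4:])
	}
	return raw, "" // no blank line: treat everything as headers
}

func main() {
	raw := "HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n<html>hello</html>"
	headers, body := splitResponse(raw)
	fmt.Println(headers)
	fmt.Println(body)
}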
diff --git a/web/css/archive.css b/web/css/archive.css new file mode 100644 index 0000000..2c7e0ca --- /dev/null +++ b/web/css/archive.css @@ -0,0 +1,14 @@ +.archive { + background-color: var(--light-bg-color); + border: 1px solid var(--border-color); + font-family: 'Nimbus Mono PS', 'Courier New', monospace; + font-size: .85rem; + margin-bottom: calc(var(--line-height) * 2); + overflow: scroll; + padding: var(--line-height) 1.5rem; + max-height: calc(var(--line-height) * 20); +} + +.archive:last-child { + margin-bottom: 0; +} \ No newline at end of file diff --git a/web/css/style.css b/web/css/style.css index f9bdb29..9fe3638 100644 --- a/web/css/style.css +++ b/web/css/style.css @@ -12,4 +12,5 @@ @import "credentials.css"; @import "intro.css"; @import "mint-intro.css"; -@import "footer.css"; \ No newline at end of file +@import "footer.css"; +@import "archive.css"; \ No newline at end of file diff --git a/web/templates/archive.html b/web/templates/archive.html new file mode 100644 index 0000000..389cf41 --- /dev/null +++ b/web/templates/archive.html @@ -0,0 +1,66 @@ +{{ template "head" . }} + +{{ with .Data }} + + +
+ +
+
+
+ {{ if .Eid }} + Site Issues + / + {{ trans .Eid }} + / + Details + {{ else if .Ep }} + Page Details + / + Details + {{ end }} +
+
+ + +
+ +
+
+
+
+ {{ if .PageReportView.PageReport.Title }} + {{ .PageReportView.PageReport.Title }}
+ {{ end }} + {{ .PageReportView.PageReport.URL }} + +
+
+
+
+ +
+
+
+

Headers

+

This block shows the response headers as received by the crawler.

+
{{ .Headers }}
+ + {{ if .Body }} +

Body

+

This block shows the response body as received by the crawler.

+
{{ .Body }}
+ {{ end }} +
+
+
+ +
+{{ end }} +{{ template "footer" . }} diff --git a/web/templates/resources.html b/web/templates/resources.html index ec0b765..62151ef 100644 --- a/web/templates/resources.html +++ b/web/templates/resources.html @@ -115,7 +115,6 @@ - {{ $cid := .ProjectView.Crawl.Id }} {{ $pid := .ProjectView.Project.Id }} {{ $crawlSitemap := .ProjectView.Project.CrawlSitemap }} @@ -127,6 +126,9 @@
General details about this URL. + {{ if .Archived }} +
This project is archived: view archived response. + {{ end }}
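Note on the lookup path introduced in the patch above: getCDXEntry builds the same SURT-style key the writer uses (reversed hostname parts joined with commas, then ")" and the request URI), and searchFileSegment then binary-searches the sorted CDX index with sort.Search, which only works because the index lines were sorted before being written. A self-contained sketch of the idea, using a made-up three-entry index; the searchableKey helper is illustrative, not the patch's API:

package main

import (
	"fmt"
	"net/url"
	"slices"
	"sort"
	"strings"
)

// searchableKey builds a SURT-style lookup key: reversed hostname parts
// joined with commas, then ")" and the request URI.
func searchableKey(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}
	parts := strings.Split(u.Hostname(), ".")
	slices.Reverse(parts)
	return strings.Join(parts, ",") + ")" + u.RequestURI(), nil
}

func main() {
	// The index lines must be sorted, otherwise sort.Search's
	// predicate is not monotonic and the lookup silently fails.
	index := []string{
		"com,example)/about",
		"com,example)/index.html",
		"org,example)/",
	}

	key, err := searchableKey("https://example.com/index.html")
	if err != nil {
		panic(err)
	}

	// sort.Search returns the first line that is >= the key.
	i := sort.Search(len(index), func(i int) bool { return index[i] >= key })
	if i < len(index) && strings.HasPrefix(index[i], key) {
		fmt.Println("found:", index[i])
	}
}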
From f426a133e7e295ebb8465481d01d633a50bfce95 Mon Sep 17 00:00:00 2001
From: StJudeWasHere <707925+StJudeWasHere@users.noreply.github.com>
Date: Wed, 13 Nov 2024 12:11:29 +0100
Subject: [PATCH 6/6] Remove archive directory if empty

---
 internal/archiver/reader.go                   |  7 ++++
 internal/archiver/writer.go                   |  1 -
 internal/models/archive_record.go             |  6 ++++
 internal/routes/resource.go                   | 22 +++----
 internal/services/{archiver.go => archive.go} | 36 +++++++++++++++++--
 web/templates/archive.html                    |  6 ++--
 6 files changed, 55 insertions(+), 23 deletions(-)
 create mode 100644 internal/models/archive_record.go
 rename internal/services/{archiver.go => archive.go} (69%)

diff --git a/internal/archiver/reader.go b/internal/archiver/reader.go
index 7e7cb68..3ae2d8b 100644
--- a/internal/archiver/reader.go
+++ b/internal/archiver/reader.go
@@ -27,6 +27,8 @@ func NewReader(waczPath string) *Reader {
 	}
 }
 
+// ReadArchive reads the archive and returns the contents of the warc record for
+// the specified URL as a string.
 func (s *Reader) ReadArchive(urlStr string) (content string) {
 	wacz, err := zip.OpenReader(s.waczPath)
 	if err != nil {
@@ -73,6 +75,8 @@
 	return string(c)
 }
 
+// getCDXEntry looks for the specified URL in the index file and returns an IndexEntry if found,
+// otherwise it returns an error.
 func (s *Reader) getCDXEntry(wacz *zip.ReadCloser, urlStr string) (*IndexEntry, error) {
 	file, err := s.getZipFile(wacz, "indexes/index.cdx")
 	if err != nil {
@@ -118,6 +122,7 @@
 	return &record, nil
 }
 
+// getZipFile returns a *zip.File from a wacz file. If not found, it returns an error.
 func (s *Reader) getZipFile(wacz *zip.ReadCloser, waczFile string) (*zip.File, error) {
 	for _, file := range wacz.File {
 		if file.Name == waczFile {
@@ -128,6 +133,8 @@
 	return nil, errors.New("file not found in wacz archive")
 }
 
+// searchFileSegment searches for the target string in the WACZ file index using binary search.
+// It loads the index contents in memory.
 func (s *Reader) searchFileSegment(offset, length int64, target string) (string, error) {
 	file, err := os.Open(s.waczPath)
 	if err != nil {
 		return "", fmt.Errorf("failed to open file: %v", err)
 	}
 	defer file.Close()
 
diff --git a/internal/archiver/writer.go b/internal/archiver/writer.go
index 6c65dce..079a1c9 100644
--- a/internal/archiver/writer.go
+++ b/internal/archiver/writer.go
@@ -233,7 +233,6 @@ func (s *Writer) createIndex() error {
 
 // createPages adds a record for each crawled page to the pages.jsonl file.
 func (s *Writer) createPages() error {
-	// Add the pages.jsonl and add the file header
 	pagesWriter, err := s.waczWriter.Create("pages/pages.jsonl")
 	if err != nil {
 		return err
diff --git a/internal/models/archive_record.go b/internal/models/archive_record.go
new file mode 100644
index 0000000..bde7bf9
--- /dev/null
+++ b/internal/models/archive_record.go
@@ -0,0 +1,6 @@
+package models
+
+type ArchiveRecord struct {
+	Headers string
+	Body    string
+}
diff --git a/internal/routes/resource.go b/internal/routes/resource.go
index 7b06457..79e8b32 100644
--- a/internal/routes/resource.go
+++ b/internal/routes/resource.go
@@ -92,6 +92,8 @@ func (h *resourceHandler) indexHandler(w http.ResponseWriter, r *http.Request) {
 	h.Renderer.RenderTemplate(w, "resources", pageView)
 }
 
+// archiveHandler handles the HTTP request for the archive page. It loads the data from the
+// archive and displays the source code of the crawler's response for a specific resource.
 func (h *resourceHandler) archiveHandler(w http.ResponseWriter, r *http.Request) {
 	user, ok := h.CookieSession.GetUser(r.Context())
 	if !ok {
@@ -142,19 +144,7 @@ func (h *resourceHandler) archiveHandler(w http.ResponseWriter, r *http.Request)
 		return
 	}
 
-	archive := h.Container.ArchiveService.ReadArchive(&pv.Project, pageReportView.PageReport.URL)
-
-	var headers, body string
-	index := strings.Index(archive, "\r\n\r\n")
-	if index != -1 {
-		// Split at the blank line (\r\n\r\n) separating headers from body;
-		// TrimSpace drops the leftover separator bytes.
-		headers = archive[:index]
-		body = strings.TrimSpace(archive[index+1:])
-	} else {
-		// No blank line found: treat the whole record as headers.
-		headers = archive
-		body = ""
-	}
+	record := h.Container.ArchiveService.ReadArchiveRecord(&pv.Project, pageReportView.PageReport.URL)
 
 	data := &struct {
 		PageReportView *models.PageReportView
@@ -162,16 +152,14 @@ func (h *resourceHandler) archiveHandler(w http.ResponseWriter, r *http.Request)
 		Eid            string
 		Ep             string
 		Tab            string
-		Headers        string
-		Body           string
+		ArchiveRecord  *models.ArchiveRecord
 	}{
 		ProjectView:    pv,
 		PageReportView: pageReportView,
 		Eid:            eid,
 		Ep:             ep,
 		Tab:            tab,
-		Headers:        headers,
-		Body:           body,
+		ArchiveRecord:  record,
 	}
 
 	pageView := &PageView{
diff --git a/internal/services/archiver.go b/internal/services/archive.go
similarity index 69%
rename from internal/services/archiver.go
rename to internal/services/archive.go
index 332275e..ebd707f 100644
--- a/internal/services/archiver.go
+++ b/internal/services/archive.go
@@ -2,8 +2,11 @@ package services
 
 import (
 	"errors"
+	"log"
 	"os"
+	"path/filepath"
 	"strconv"
+	"strings"
 
 	"github.com/stjudewashere/seonaut/internal/archiver"
 	"github.com/stjudewashere/seonaut/internal/models"
@@ -26,11 +29,22 @@
 }
 
-// ReadArchive reads an URLs WACZ record from a project's archive.
-func (s *ArchiveService) ReadArchive(p *models.Project, urlStr string) string {
+// ReadArchiveRecord reads a URL's WACZ record from a project's archive and splits it into headers and body.
+func (s *ArchiveService) ReadArchiveRecord(p *models.Project, urlStr string) *models.ArchiveRecord {
 	waczPath := s.getArchiveFile(p)
 	reader := archiver.NewReader(waczPath)
-	return reader.ReadArchive(urlStr)
+	content := reader.ReadArchive(urlStr)
+
+	record := &models.ArchiveRecord{}
+	index := strings.Index(content, "\r\n\r\n")
+	if index != -1 {
+		record.Headers = content[:index]
+		record.Body = strings.TrimSpace(content[index+1:])
+	} else {
+		record.Headers = content
+	}
+
+	return record
 }
 
 // ArchiveExists checks if a wacz file exists for the current project.
@@ -50,6 +64,25 @@
 	file := s.getArchiveFile(p)
 	os.Remove(file)
+
+	// Check if the archive directory is empty and remove it. ReadDir(1) returns io.EOF when the directory has no entries.
+	dir := filepath.Dir(file)
+	d, err := os.Open(dir)
+	if err != nil {
+		log.Printf("failed to open archive dir %s: %v", dir, err)
+		return
+	}
+	defer d.Close()
+
+	_, err = d.ReadDir(1)
+	if err == nil {
+		return // dir is not empty.
+	}
+
+	err = os.Remove(dir)
+	if err != nil {
+		log.Printf("failed to remove empty archive dir %s: %v", dir, err)
+	}
 }
 
 // GetArchiveFilePath returns the project's wacz file path if it exists,
diff --git a/web/templates/archive.html b/web/templates/archive.html
index 389cf41..da9359c 100644
--- a/web/templates/archive.html
+++ b/web/templates/archive.html
@@ -50,12 +50,12 @@

Headers

This block shows the response headers as received by the crawler.

-
{{ .Headers }}
+
{{ .ArchiveRecord.Headers }}
- {{ if .Body }} + {{ if .ArchiveRecord.Body }}

Body

This block shows the response body as received by the crawler.

-
{{ .Body }}
+
{{ .ArchiveRecord.Body }}
{{ end }}