Skip to content

Commit

Permalink
Add WACZ archive as export option
Browse files Browse the repository at this point in the history
  • Loading branch information
StJudeWasHere committed Nov 8, 2024
1 parent 627c673 commit 1342faf
Show file tree
Hide file tree
Showing 18 changed files with 509 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ bin/*
# Ignore any file in the web/static folder except favicon.ico and robots.txt
# The frontend build will copy fonts and styles into this folder.

archive/*
web/static/*
!web/static/favicon.ico
!web/static/robots.txt
Empty file added archive/.gitignore
Empty file.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/gorilla/websocket v1.5.3
github.com/microcosm-cc/bluemonday v1.0.27
github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4
github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690
github.com/spf13/viper v1.19.0
github.com/temoto/robotstxt v1.1.2
github.com/turk/go-sitemap v0.0.0-20210912154218-82ad01095e30
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3
github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0=
github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE=
github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ=
github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690 h1:2RLSydlHktw3Fo4nwOQwjexn1d49KJb/i+EmlT4D878=
github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690/go.mod h1:LuhAhBK7l5/QEJmiz3tVGLi8n0IwqAwLX/ndr+6XSDE=
github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0=
github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8=
Expand Down
1 change: 1 addition & 0 deletions internal/models/project.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ type Project struct {
Deleting bool
BasicAuth bool
CheckExternalLinks bool
Archive bool
}
18 changes: 13 additions & 5 deletions internal/repository/project.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ func (ds *ProjectRepository) SaveProject(project *models.Project, uid int) {
allow_subdomains,
basic_auth,
user_id,
check_external_links
check_external_links,
archive
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`

stmt, _ := ds.DB.Prepare(query)
Expand All @@ -40,6 +41,7 @@ func (ds *ProjectRepository) SaveProject(project *models.Project, uid int) {
project.BasicAuth,
uid,
project.CheckExternalLinks,
project.Archive,
)
if err != nil {
log.Printf("saveProject: %v\n", err)
Expand All @@ -61,7 +63,8 @@ func (ds *ProjectRepository) FindProjectsByUser(uid int) []models.Project {
basic_auth,
deleting,
created,
check_external_links
check_external_links,
archive
FROM projects
WHERE user_id = ?
ORDER BY url ASC`
Expand All @@ -86,6 +89,7 @@ func (ds *ProjectRepository) FindProjectsByUser(uid int) []models.Project {
&p.Deleting,
&p.Created,
&p.CheckExternalLinks,
&p.Archive,
)
if err != nil {
log.Println(err)
Expand All @@ -112,7 +116,8 @@ func (ds *ProjectRepository) FindProjectById(id int, uid int) (models.Project, e
basic_auth,
deleting,
created,
check_external_links
check_external_links,
archive
FROM projects
WHERE id = ? AND user_id = ?`

Expand All @@ -131,6 +136,7 @@ func (ds *ProjectRepository) FindProjectById(id int, uid int) (models.Project, e
&p.Deleting,
&p.Created,
&p.CheckExternalLinks,
&p.Archive,
)
if err != nil {
log.Println(err)
Expand Down Expand Up @@ -169,7 +175,8 @@ func (ds *ProjectRepository) UpdateProject(p *models.Project) error {
crawl_sitemap = ?,
allow_subdomains = ?,
basic_auth = ?,
check_external_links = ?
check_external_links = ?,
archive = ?
WHERE id = ?
`
_, err := ds.DB.Exec(
Expand All @@ -181,6 +188,7 @@ func (ds *ProjectRepository) UpdateProject(p *models.Project) error {
p.AllowSubdomains,
p.BasicAuth,
p.CheckExternalLinks,
p.Archive,
p.Id,
)

Expand Down
1 change: 1 addition & 0 deletions internal/routes/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func NewServer(container *services.Container) {
http.HandleFunc("GET /export/csv", container.CookieSession.Auth(exportHandler.csvHandler))
http.HandleFunc("GET /export/sitemap", container.CookieSession.Auth(exportHandler.sitemapHandler))
http.HandleFunc("GET /export/resources", container.CookieSession.Auth(exportHandler.resourcesHandler))
http.HandleFunc("GET /export/wazc", container.CookieSession.Auth(exportHandler.waczHandler))

// Issues routes
issueHandler := issueHandler{container}
Expand Down
78 changes: 77 additions & 1 deletion internal/routes/export.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package routes
import (
"fmt"
"io"
"log"
"net/http"
"os"
"strconv"
"time"

Expand Down Expand Up @@ -43,10 +45,17 @@ func (h *exportHandler) indexHandler(w http.ResponseWriter, r *http.Request) {
return
}

archiveExists := h.Container.ProjectService.ArchiveExists(&pv.Project)
h.Renderer.RenderTemplate(w, "export", &PageView{
Data: struct{ Project models.Project }{Project: pv.Project},
User: *user,
PageTitle: "EXPORT_VIEW",
Data: struct {
Project models.Project
ArchiveExists bool
}{
Project: pv.Project,
ArchiveExists: archiveExists,
},
})
}

Expand Down Expand Up @@ -169,3 +178,70 @@ func (h *exportHandler) resourcesHandler(w http.ResponseWriter, r *http.Request)
w.Header().Add("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s.csv\"", fileName))
e(w, &pv.Crawl)
}

// waczHandler sends the WACZ archive of a specific project as a file download.
//
// It expects a "pid" query parameter with the project's id. The project is
// looked up together with the session user's id, and the archive path is
// obtained from the project service.
//
// Responses:
//   - redirect to "/signout" when there is no user in the request context;
//   - redirect to "/" when "pid" is not an integer, the project lookup fails,
//     or the archive path cannot be resolved;
//   - 404 when the archive file cannot be opened or stat'ed;
//   - otherwise the file contents with an attachment Content-Disposition
//     named after the project's host.
func (h *exportHandler) waczHandler(w http.ResponseWriter, r *http.Request) {
	user, ok := h.CookieSession.GetUser(r.Context())
	if !ok {
		http.Redirect(w, r, "/signout", http.StatusSeeOther)
		return
	}

	pid, err := strconv.Atoi(r.URL.Query().Get("pid"))
	if err != nil {
		http.Redirect(w, r, "/", http.StatusSeeOther)
		return
	}

	p, err := h.ProjectService.FindProject(pid, user.Id)
	if err != nil {
		http.Redirect(w, r, "/", http.StatusSeeOther)
		return
	}

	archiveFilePath, err := h.Container.ProjectService.GetArchiveFilePath(&p)
	if err != nil {
		http.Redirect(w, r, "/", http.StatusSeeOther)
		return
	}

	file, err := os.Open(archiveFilePath)
	if err != nil {
		http.Error(w, "File not found", http.StatusNotFound)
		return
	}
	defer file.Close()

	// Stat is only needed for the size so the client can show download progress.
	info, err := file.Stat()
	if err != nil {
		http.Error(w, "File not found", http.StatusNotFound)
		return
	}

	w.Header().Set("Content-Type", "application/wacz")
	w.Header().Add("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s.wacz\"", p.Host))
	w.Header().Set("Content-Length", strconv.FormatInt(info.Size(), 10))

	// io.Copy streams the file without loading it into memory. The previous
	// manual loop asserted w.(http.Flusher) unconditionally, which panics when
	// the ResponseWriter is wrapped by something that does not implement
	// http.Flusher; the server flushes on its own as its buffer fills.
	// Headers are already sent at this point, so a failure can only be logged.
	if _, err := io.Copy(w, file); err != nil {
		log.Printf("waczHandler: streaming archive: %v", err)
	}
}
11 changes: 11 additions & 0 deletions internal/routes/project.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ func (h *projectHandler) addPostHandler(w http.ResponseWriter, r *http.Request)
basicAuth = false
}

archive, err := strconv.ParseBool(r.FormValue("archive"))
if err != nil {
archive = false
}

project := &models.Project{
URL: r.FormValue("url"),
IgnoreRobotsTxt: ignoreRobotsTxt,
Expand All @@ -118,6 +123,7 @@ func (h *projectHandler) addPostHandler(w http.ResponseWriter, r *http.Request)
AllowSubdomains: allowSubdomains,
BasicAuth: basicAuth,
CheckExternalLinks: checkExternalLinks,
Archive: archive,
}

err = h.ProjectService.SaveProject(project, user.Id)
Expand Down Expand Up @@ -263,6 +269,11 @@ func (h *projectHandler) editPostHandler(w http.ResponseWriter, r *http.Request)
p.BasicAuth = false
}

p.Archive, err = strconv.ParseBool(r.FormValue("archive"))
if err != nil {
p.Archive = false
}

err = h.ProjectService.UpdateProject(&p)
if err != nil {
pageView := &PageView{
Expand Down
Loading

0 comments on commit 1342faf

Please sign in to comment.