Check if sitemap is blocked in robots.txt
StJudeWasHere committed Feb 21, 2024
1 parent 666bd9f commit 363c11f
Showing 7 changed files with 74 additions and 36 deletions.
89 changes: 58 additions & 31 deletions internal/crawler/crawler.go
@@ -27,22 +27,23 @@ type Options struct {
}

type Crawler struct {
url *url.URL
options *Options
queue *queue.Queue
storage *urlstorage.URLStorage
sitemapStorage *urlstorage.URLStorage
sitemapChecker *httpcrawler.SitemapChecker
sitemapExists bool
sitemaps []string
robotstxtExists bool
responseCounter int
robotsChecker *httpcrawler.RobotsChecker
prStream chan *models.PageReportMessage
allowedDomains map[string]bool
mainDomain string
httpCrawler *httpcrawler.HttpCrawler
qStream chan *httpcrawler.RequestMessage
url *url.URL
options *Options
queue *queue.Queue
storage *urlstorage.URLStorage
sitemapStorage *urlstorage.URLStorage
sitemapChecker *httpcrawler.SitemapChecker
sitemapExists bool
sitemapIsBlocked bool
sitemaps []string
robotstxtExists bool
responseCounter int
robotsChecker *httpcrawler.RobotsChecker
prStream chan *models.PageReportMessage
allowedDomains map[string]bool
mainDomain string
httpCrawler *httpcrawler.HttpCrawler
qStream chan *httpcrawler.RequestMessage
}

func NewCrawler(url *url.URL, options *Options) *Crawler {
@@ -72,29 +73,50 @@ func NewCrawler(url *url.URL, options *Options) *Crawler {
}

sitemaps := robotsChecker.GetSitemaps(url)
nonBlockedSitemaps := []string{}
if len(sitemaps) == 0 {
	sitemaps = []string{url.Scheme + "://" + url.Host + "/sitemap.xml"}
}

sitemapIsBlocked := false
for _, sm := range sitemaps {
	parsedSm, err := url.Parse(sm)
	if err != nil {
		continue
	}

	if robotsChecker.IsBlocked(parsedSm) {
		sitemapIsBlocked = true
		if !options.IgnoreRobotsTxt {
			continue
		}
	}

	nonBlockedSitemaps = append(nonBlockedSitemaps, sm)
}

sitemaps = nonBlockedSitemaps

sitemapChecker := httpcrawler.NewSitemapChecker(httpClient, options.MaxPageReports)
qStream := make(chan *httpcrawler.RequestMessage)

c := &Crawler{
url: url,
options: options,
queue: q,
storage: storage,
sitemapStorage: urlstorage.New(),
sitemapChecker: sitemapChecker,
sitemapExists: sitemapChecker.SitemapExists(sitemaps),
sitemaps: sitemaps,
robotsChecker: robotsChecker,
robotstxtExists: robotsChecker.Exists(url),
allowedDomains: map[string]bool{mainDomain: true, "www." + mainDomain: true},
mainDomain: mainDomain,
prStream: make(chan *models.PageReportMessage),
qStream: qStream,
httpCrawler: httpcrawler.New(httpClient, qStream),
url: url,
options: options,
queue: q,
storage: storage,
sitemapStorage: urlstorage.New(),
sitemapChecker: sitemapChecker,
sitemapExists: sitemapChecker.SitemapExists(sitemaps),
sitemapIsBlocked: sitemapIsBlocked,
sitemaps: sitemaps,
robotsChecker: robotsChecker,
robotstxtExists: robotsChecker.Exists(url),
allowedDomains: map[string]bool{mainDomain: true, "www." + mainDomain: true},
mainDomain: mainDomain,
prStream: make(chan *models.PageReportMessage),
qStream: qStream,
httpCrawler: httpcrawler.New(httpClient, qStream),
}

go func() {
@@ -294,6 +316,11 @@ func (c *Crawler) RobotstxtExists() bool {
return c.robotstxtExists
}

// Returns true if any of the website's sitemaps is blocked in the robots.txt file
func (c *Crawler) SitemapIsBlocked() bool {
return c.sitemapIsBlocked
}

// Returns a slice with all the crawlable Links from the PageReport's links.
// URLs extracted from internal Links and ExternalLinks are crawlable only if the domain name is allowed and
// if they don't have the "nofollow" attribute. If they have the "nofollow" attribute, they are also considered
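The filtering added to NewCrawler above tests every sitemap URL advertised in robots.txt (or the default /sitemap.xml) against the site's robots rules, drops blocked entries unless IgnoreRobotsTxt is set, and records the result in sitemapIsBlocked. As a rough standalone sketch of the same kind of check — assuming the temoto/robotstxt parser, which is not necessarily what this project's RobotsChecker wraps — the logic looks roughly like this:

package main

import (
	"fmt"
	"net/url"

	"github.com/temoto/robotstxt"
)

// sitemapIsBlocked reports whether the sitemap URL's path is disallowed
// for the given user agent by the parsed robots.txt rules.
func sitemapIsBlocked(robots *robotstxt.RobotsData, sitemap *url.URL, agent string) bool {
	return !robots.TestAgent(sitemap.Path, agent)
}

func main() {
	// Hypothetical robots.txt that both advertises and disallows the sitemap.
	body := "User-agent: *\nDisallow: /sitemap.xml\nSitemap: https://example.com/sitemap.xml\n"

	robots, err := robotstxt.FromString(body)
	if err != nil {
		panic(err)
	}

	sm, _ := url.Parse("https://example.com/sitemap.xml")
	fmt.Println(sitemapIsBlocked(robots, sm, "bot")) // prints: true
}

In the actual change, a blocked sitemap is still kept in the list when IgnoreRobotsTxt is enabled, so the crawl can proceed while the dashboard still reports that the sitemap is blocked.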
1 change: 1 addition & 0 deletions internal/crawler/crawler_service.go
@@ -134,6 +134,7 @@ func (s *Service) StartCrawler(p models.Project) (*models.Crawl, error) {

crawl.RobotstxtExists = c.RobotstxtExists()
crawl.SitemapExists = c.SitemapExists()
crawl.SitemapIsBlocked = c.SitemapIsBlocked()

crawl, err = s.store.SaveEndCrawl(crawl)
if err != nil {
4 changes: 4 additions & 0 deletions internal/datastore/project.go
@@ -163,6 +163,7 @@ func (ds *Datastore) SaveEndCrawl(c *models.Crawl) (*models.Crawl, error) {
noindex = ?,
robotstxt_exists = ?,
sitemap_exists = ?,
sitemap_blocked = ?,
links_internal_follow = ?,
links_internal_nofollow = ?,
links_external_follow = ?,
@@ -186,6 +187,7 @@ func (ds *Datastore) SaveEndCrawl(c *models.Crawl) (*models.Crawl, error) {
c.Noindex,
c.RobotstxtExists,
c.SitemapExists,
c.SitemapIsBlocked,
c.InternalFollowLinks,
c.InternalNoFollowLinks,
c.ExternalFollowLinks,
@@ -216,6 +218,7 @@ func (ds *Datastore) GetLastCrawl(p *models.Project) models.Crawl {
issues_end,
robotstxt_exists,
sitemap_exists,
sitemap_blocked,
links_internal_follow,
links_internal_nofollow,
links_external_follow,
@@ -241,6 +244,7 @@ func (ds *Datastore) GetLastCrawl(p *models.Project) models.Crawl {
&crawl.IssuesEnd,
&crawl.RobotstxtExists,
&crawl.SitemapExists,
&crawl.SitemapIsBlocked,
&crawl.InternalFollowLinks,
&crawl.InternalNoFollowLinks,
&crawl.ExternalFollowLinks,
1 change: 1 addition & 0 deletions internal/models/crawl.go
@@ -20,6 +20,7 @@ type Crawl struct {
BlockedByRobotstxt int // URLs blocked by robots.txt
Noindex int // URLS with noindex attribute
SitemapExists bool
SitemapIsBlocked bool
RobotstxtExists bool
InternalFollowLinks int
InternalNoFollowLinks int
1 change: 1 addition & 0 deletions migrations/0049_sitemap_blocked.down.sql
@@ -0,0 +1 @@
ALTER TABLE `crawls` DROP COLUMN `sitemap_blocked`;
1 change: 1 addition & 0 deletions migrations/0049_sitemap_blocked.up.sql
@@ -0,0 +1 @@
ALTER TABLE `crawls` ADD COLUMN `sitemap_blocked` tinyint NOT NULL DEFAULT 0;
13 changes: 8 additions & 5 deletions web/templates/dashboard.html
@@ -90,14 +90,17 @@ <h2 class="title">Current Crawl</h2>

<p class="crawler-item">
{{ if .ProjectView.Crawl.SitemapExists }}

<svg width="24" height="24" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd"><path d="M24 4.685l-16.327 17.315-7.673-9.054.761-.648 6.95 8.203 15.561-16.501.728.685z"/></svg>
<span>Sitemap.xml found.</span>

{{ if .ProjectView.Crawl.SitemapIsBlocked }}
<svg width="24" height="24" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd"><path d="M12 11.293l10.293-10.293.707.707-10.293 10.293 10.293 10.293-.707.707-10.293-10.293-10.293 10.293-.707-.707 10.293-10.293-10.293-10.293.707-.707 10.293 10.293z"/></svg>
<span>Sitemap blocked.</span>
{{ else }}
<svg width="24" height="24" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd"><path d="M24 4.685l-16.327 17.315-7.673-9.054.761-.648 6.95 8.203 15.561-16.501.728.685z"/></svg>
<span>Sitemap found.</span>
{{ end }}
{{ else }}

<svg width="24" height="24" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd"><path d="M12 11.293l10.293-10.293.707.707-10.293 10.293 10.293 10.293-.707.707-10.293-10.293-10.293 10.293-.707-.707 10.293-10.293-10.293-10.293.707-.707 10.293 10.293z"/></svg>
<span>Sitemap.xml not found.</span>
<span>Sitemap not found.</span>

{{ end }}
</p>