Skip to content

Commit

Permalink
add sitemap based crawler
Browse files (browse the repository at this point in the history)
  • Loading branch information
yujonglee committed Jun 28, 2024
1 parent 8d54c61 commit c9a4a42
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 33 deletions.
103 changes: 73 additions & 30 deletions core/lib/canary/crawler.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,73 @@
defmodule Canary.Crawler do
  @moduledoc """
  Entry point for crawling a site.

  Every strategy module listed in `@modules` is tried in order and the first
  one that answers with `{:ok, result}` decides the outcome.
  """

  @callback run(String.t()) :: {:ok, list(tuple())} | {:error, any()}

  # Strategies in the order they are attempted.
  @modules [Canary.Crawler.Sitemap, Canary.Crawler.Fallback]

  @doc """
  Runs the strategies against `url` until one of them succeeds.

  Returns the first `{:ok, result}` a strategy produces. Any other return
  value is treated as a failure of that strategy; when none succeeds the
  result is `{:error, :failed}`.
  """
  def run(url) do
    Enum.find_value(@modules, {:error, :failed}, fn strategy ->
      case strategy.run(url) do
        {:ok, pages} -> {:ok, pages}
        # Returning nil makes find_value move on to the next strategy.
        _failure -> nil
      end
    end)
  end
end

defmodule Canary.Crawler.Sitemap do
  @moduledoc """
  Crawler strategy that requests `/sitemap.xml` relative to the given base URL
  and then downloads each URL found in it.
  """

  @doc """
  Fetches the sitemap for `url` and the pages it lists.

  Returns:

    * `{:ok, pairs}` - `pairs` is a list of `{page_url, body}` tuples, one for
      every listed URL that answered with HTTP status 200. Pages that fail or
      answer with another status are dropped.
    * `{:error, :not_found}` - the response carried no sitemap URLs.
    * `{:error, exception}` - the sitemap request itself failed.

  The sitemap request never raises on a transport failure, so a caller can
  move on to another strategy.
  """
  def run(url) do
    request =
      Req.new(base_url: url)
      |> ReqCrawl.Sitemap.attach()

    # `Req.get/2` instead of `Req.get!/2`: a connection error must come back as
    # an error tuple rather than an exception.
    case Req.get(request, url: "/sitemap.xml") do
      {:ok, response} ->
        response
        |> sitemap_urls()
        |> fetch_pages()

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Reads the second element of the tuple stored under `:crawl_sitemap` in the
  # response's private data. Anything else (missing key, non-tuple) yields nil
  # instead of raising.
  # NOTE(review): if the first tuple element marks a sitemap *index*, the URLs
  # would point at further sitemaps rather than pages - confirm against ReqCrawl.
  defp sitemap_urls(%{private: %{crawl_sitemap: entry}})
       when is_tuple(entry) and tuple_size(entry) >= 2 do
    elem(entry, 1)
  end

  defp sitemap_urls(_response), do: nil

  # No URLs means the sitemap strategy has nothing to offer.
  defp fetch_pages(urls) when urls in [nil, []], do: {:error, :not_found}

  defp fetch_pages(urls) do
    pairs =
      urls
      |> Enum.map(&fetch_url/1)
      |> Enum.reject(&is_nil/1)

    {:ok, pairs}
  end

  # Downloads a single page; only a 200 response counts as a hit.
  defp fetch_url(url) do
    case Req.get(url: url) do
      {:ok, %{status: 200, body: body}} -> {url, body}
      _ -> nil
    end
  end
end

defmodule Canary.Crawler.Fallback do
defmodule Filter do
  @moduledoc """
  URL filter that only lets through links on the host given in `opts.host`.
  """

  @behaviour Crawler.Fetcher.UrlFilter.Spec

  # Returns `{:ok, true}` when the host of `url` equals `opts.host`, otherwise
  # `{:ok, false}`. A link that cannot be parsed as a URI is rejected instead of
  # raising, so one malformed href does not crash the caller.
  def filter(url, opts) do
    case URI.new(url) do
      {:ok, %URI{host: host}} -> {:ok, host == opts.host}
      {:error, _invalid_part} -> {:ok, false}
    end
  end
end

defmodule Scraper do
  @moduledoc """
  Scraper callback that records each page body in the agent referenced by
  `opts.store_pid`, keyed by the page's normalized URL.
  """

  @behaviour Crawler.Scraper.Spec

  # Stores `body` under the normalized `url` and hands the page back unchanged.
  def scrape(%Crawler.Store.Page{url: url, body: body, opts: opts} = page) do
    Agent.update(opts.store_pid, fn pages ->
      Map.put(pages, normalize(url), body)
    end)

    {:ok, page}
  end

  # Drops the query string and fragment, then strips a trailing "/".
  defp normalize(url) do
    parsed = URI.parse(url)
    without_extras = %{parsed | query: nil, fragment: nil}

    String.replace_trailing(URI.to_string(without_extras), "/", "")
  end
end

def run(url) do
{:ok, store_pid} = Agent.start_link(fn -> %{} end)

Expand All @@ -10,8 +79,8 @@ defmodule Canary.Crawler do
interval: 10,
max_pages: 100 * 100,
max_depths: 5,
url_filter: Canary.Crawler.Filter,
scraper: Canary.Crawler.Scraper,
url_filter: Filter,
scraper: Scraper,
store_pid: store_pid,
user_agent: "Canary (github.com/fastrepl/canary)"
)
Expand All @@ -23,11 +92,11 @@ defmodule Canary.Crawler do
{:ok, result}

_ ->
:error
{:error, :failed}
end
end

defp wait(opts, seconds_left \\ 10) do
defp wait(opts, seconds_left \\ 20) do
cond do
seconds_left < 0 ->
Crawler.stop(opts)
Expand All @@ -42,29 +111,3 @@ defmodule Canary.Crawler do
end
end
end

defmodule Canary.Crawler.Filter do
  @moduledoc """
  URL filter that only accepts links whose host equals `opts.host`.
  """

  @behaviour Crawler.Fetcher.UrlFilter.Spec

  # Compares the host of `url` with `opts.host` and returns the verdict as
  # `{:ok, boolean}`. An unparseable link yields `{:ok, false}` rather than an
  # exception, so a single malformed href cannot crash the caller.
  def filter(url, opts) do
    case URI.new(url) do
      {:ok, %URI{host: host}} -> {:ok, host == opts.host}
      {:error, _invalid_part} -> {:ok, false}
    end
  end
end

defmodule Canary.Crawler.Scraper do
  @moduledoc """
  Scraper callback that saves every page body into the agent found at
  `opts.store_pid`, using the normalized page URL as the key.
  """

  @behaviour Crawler.Scraper.Spec

  # Writes `body` into the store under the normalized `url`; the page itself is
  # returned untouched.
  def scrape(%Crawler.Store.Page{url: url, body: body, opts: opts} = page) do
    store = opts.store_pid
    Agent.update(store, fn entries -> Map.put(entries, normalize(url), body) end)
    {:ok, page}
  end

  # Removes query and fragment from the URL and trims a trailing "/".
  defp normalize(url) do
    uri = URI.parse(url)

    %{uri | query: nil, fragment: nil}
    |> URI.to_string()
    |> String.replace_trailing("/", "")
  end
end
4 changes: 2 additions & 2 deletions core/lib/canary/workers/fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ defmodule Canary.Workers.Fetcher do
end

defp process(%Source{type: :web} = src) do
{:ok, data} = Canary.Crawler.run(src.web_base_url)
{:ok, pairs} = Canary.Crawler.run(src.web_base_url)

inputs =
data
pairs
|> Enum.flat_map(fn {url, html} ->
html
|> Canary.Reader.html_to_md!()
Expand Down
4 changes: 3 additions & 1 deletion core/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ defmodule Canary.MixProject do
{:crawler,
git: "https://github.com/fredwu/crawler.git",
ref: "6866bbe287c760b7e4bba1925e80f2a4494d7af3"},
{:oapi_github, "~> 0.3.3"}
{:oapi_github, "~> 0.3.3"},
{:req_crawl, "~> 0.2.0"},
{:saxy, "~> 1.5"}
]
end

Expand Down
2 changes: 2 additions & 0 deletions core/mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,12 @@
"postgrex": {:hex, :postgrex, "0.18.0", "f34664101eaca11ff24481ed4c378492fed2ff416cd9b06c399e90f321867d7e", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "a042989ba1bc1cca7383ebb9e461398e3f89f868c92ce6671feb7ef132a252d1"},
"reactor": {:hex, :reactor, "0.8.4", "344d02ba4a0010763851f4e4aa0ff190ebe7e392e3c27c6cd143dde077b986e7", [:mix], [{:libgraph, "~> 0.16", [hex: :libgraph, repo: "hexpm", optional: false]}, {:spark, "~> 2.0", [hex: :spark, repo: "hexpm", optional: false]}, {:splode, "~> 0.2", [hex: :splode, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.2", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "49c1fd3c786603cec8140ce941c41c7ea72cc4411860ccdee9876c4ca2204f81"},
"req": {:hex, :req, "0.5.0", "6d8a77c25cfc03e06a439fb12ffb51beade53e3fe0e2c5e362899a18b50298b3", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "dda04878c1396eebbfdec6db6f3d4ca609e5c8846b7ee88cc56eb9891406f7a3"},
"req_crawl": {:hex, :req_crawl, "0.2.1", "e68302507888311ad0af0a7c8a541474d860258b9cb225b69da64941869ae735", [:mix], [{:req, "~> 0.4", [hex: :req, repo: "hexpm", optional: false]}, {:saxy, "~> 1.5", [hex: :saxy, repo: "hexpm", optional: true]}], "hexpm", "6c4e81de66c064d5077bb30967987e8b5e082e81144280fe8fa59f5d744d615b"},
"retry": {:hex, :retry, "0.18.0", "dc58ebe22c95aa00bc2459f9e0c5400e6005541cf8539925af0aa027dc860543", [:mix], [], "hexpm", "9483959cc7bf69c9e576d9dfb2b678b71c045d3e6f39ab7c9aa1489df4492d73"},
"rewrite": {:hex, :rewrite, "0.10.5", "6afadeae0b9d843b27ac6225e88e165884875e0aed333ef4ad3bf36f9c101bed", [:mix], [{:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:sourceror, "~> 1.0", [hex: :sourceror, repo: "hexpm", optional: false]}], "hexpm", "51cc347a4269ad3a1e7a2c4122dbac9198302b082f5615964358b4635ebf3d4f"},
"rustler": {:hex, :rustler, "0.32.1", "f4cf5a39f9e85d182c0a3f75fa15b5d0add6542ab0bf9ceac6b4023109ebd3fc", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "b96be75526784f86f6587f051bc8d6f4eaff23d6e0f88dbcfe4d5871f52946f7"},
"salsa20": {:hex, :salsa20, "1.0.4", "404cbea1fa8e68a41bcc834c0a2571ac175580fec01cc38cc70c0fb9ffc87e9b", [:mix], [], "hexpm", "745ddcd8cfa563ddb0fd61e7ce48d5146279a2cf7834e1da8441b369fdc58ac6"},
"saxy": {:hex, :saxy, "1.5.0", "0141127f2d042856f135fb2d94e0beecda7a2306f47546dbc6411fc5b07e28bf", [:mix], [], "hexpm", "ea7bb6328fbd1f2aceffa3ec6090bfb18c85aadf0f8e5030905e84235861cf89"},
"slugify": {:hex, :slugify, "1.3.1", "0d3b8b7e5c1eeaa960e44dce94382bee34a39b3ea239293e457a9c5b47cc6fd3", [:mix], [], "hexpm", "cb090bbeb056b312da3125e681d98933a360a70d327820e4b7f91645c4d8be76"},
"sourceror": {:hex, :sourceror, "1.3.0", "70ab9e8bf6df085a1effba4b49ad621b7153b065f69ef6cdb82e6088f2026029", [:mix], [], "hexpm", "1794c3ceeca4eb3f9437261721e4d9cbf846d7c64c7aee4f64062b18d5ce1eac"},
"spark": {:hex, :spark, "2.2.4", "077363750eec4d80ffd4b20075676d17fce8bf82af1aa6aa51d2a539685b8d83", [:mix], [{:igniter, "~> 0.2", [hex: :igniter, repo: "hexpm", optional: false]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:sourceror, "~> 1.2", [hex: :sourceror, repo: "hexpm", optional: false]}], "hexpm", "fd92bdd4508852bcd445c463d0536d0f130ed828d90401d17a061bcdeca4372a"},
Expand Down

0 comments on commit c9a4a42

Please sign in to comment.