This repository has been archived by the owner on Jul 9, 2022. It is now read-only.

Conti: scraper fixed (#73) #90

Open
wants to merge 1 commit into base: main
54 changes: 26 additions & 28 deletions src/sites/conti.py
@@ -1,5 +1,7 @@
 from datetime import datetime
 import logging
+import re
+import json
 
 from bs4 import BeautifulSoup

@@ -11,39 +13,36 @@ class Conti(SiteCrawler):
     actor = "Conti"
 
     def _handle_page(self, body: str):
-        soup = BeautifulSoup(body, "html.parser")
-
-        victim_divs = soup.find_all("div", class_="card")
+        news_list = re.findall(r'newsList\(\s(\[.*\])\s\);', body)
+
+        if len(news_list) > 0:
+            news_list = json.loads(news_list[0])
 
-        for div in victim_divs:
-            # parse all the stuff out of the html
-            name = div.find("div", class_="title").text[1:-1].strip()
-
-            footer_div = div.find("div", class_="footer")
-            published = footer_div.find("div")
-            published_dt = datetime.strptime(published.text.strip(), "%B %d, %Y")
-
-            url = self.url + footer_div.find_all("div")[-1].find("a").attrs["href"]
-
-            logging.debug(f"Found victim: {name}")
+            for elem in news_list:
+                name = elem['title']
+                published_dt = datetime.fromtimestamp(elem['date'])
+                url = f"{self.url}{elem['url']}"
+
+                logging.debug(f"Found victim: {name} ({url}) published at {published_dt}")
 
-            # check if the org is already seen (search by url because name isn't guarenteed unique)
-            q = self.session.query(Victim).filter_by(url=url, site=self.site)
+                # check if the org is already seen (search by url because name isn't guarenteed unique)
+                q = self.session.query(Victim).filter_by(url=url, site=self.site)
 
-            if q.count() == 0:
-                # new org
-                v = Victim(name=name, url=url, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
-                self.session.add(v)
-                self.new_victims.append(v)
-            else:
-                # already seen, update last_seen
-                v = q.first()
-                v.last_seen = datetime.utcnow()
+                if q.count() == 0:
+                    # new org
+                    v = Victim(name=name, url=url, published=published_dt, first_seen=datetime.utcnow(), last_seen=datetime.utcnow(), site=self.site)
+                    self.session.add(v)
+                    self.new_victims.append(v)
+                else:
+                    # already seen, update last_seen
+                    v = q.first()
+                    v.last_seen = datetime.utcnow()
 
-            # add the org to our seen list
-            self.current_victims.append(v)
+                # add the org to our seen list
+                self.current_victims.append(v)
 
-        self.session.commit()
+            self.session.commit()

     def scrape_victims(self):
         with Proxy() as p:
@@ -60,7 +59,6 @@ def scrape_victims(self):
             # start at the last page and go backwards, in case a new victim was added while running (unlikely but possible)
             for i in range(max_page_num, 0, -1):
                 r = p.get(f"{self.url}/page/{i}", headers=self.headers)
-
                 self._handle_page(r.content.decode())
 
             # check one past the last page to see if new orgs were added that caused another page to be added
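
Note on the change: the leak site no longer has to be scraped out of the rendered HTML cards; the victim list is embedded in the page as a JSON array passed to a newsList(...) JavaScript call, and the patched _handle_page pulls that array out with a regex and parses it. A minimal, self-contained sketch of that extraction step is below; the sample body, base URL, and field values are invented for illustration, while the regex and the title/date/url keys come from the diff above.

import json
import re
from datetime import datetime

# Invented sample of a page body containing an embedded newsList(...) payload.
body = 'newsList( [{"title": "ACME Corp", "date": 1624000000, "url": "/news/acme"}] );'

# Same extraction approach as the patched scraper: grab the JSON array, then parse it.
matches = re.findall(r'newsList\(\s(\[.*\])\s\);', body)
if matches:
    for elem in json.loads(matches[0]):
        name = elem["title"]
        published = datetime.fromtimestamp(elem["date"])   # epoch seconds -> datetime
        url = f"https://example.onion{elem['url']}"         # base URL is site-specific
        print(name, published, url)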