Skip to content

Commit

Permalink
chore: Remove pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Sep 28, 2024
1 parent 0eb8b9f commit 3f16685
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 48 deletions.
16 changes: 8 additions & 8 deletions generic_scrapy/base_spiders/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
import scrapy
from scrapy.exceptions import UsageError

# Maps the human-friendly `date_format` names accepted by spiders to the
# corresponding `datetime.strptime`/`strftime` format strings.
# Module-level (rather than a class attribute) so non-spider code such as
# the `incrementalupdate` command can use it without touching BaseSpider.
VALID_DATE_FORMATS = {
    "date": "%Y-%m-%d",
    "datetime": "%Y-%m-%dT%H:%M:%S",
    "year": "%Y",
    "year-month": "%Y-%m",
}


class BaseSpider(scrapy.Spider):
"""
Expand All @@ -27,13 +34,6 @@ class BaseSpider(scrapy.Spider):
"""

VALID_DATE_FORMATS = {
"date": "%Y-%m-%d",
"datetime": "%Y-%m-%dT%H:%M:%S",
"year": "%Y",
"year-month": "%Y-%m",
}

# Regarding the data source.
date_format = "datetime"
date_required = False
Expand Down Expand Up @@ -63,7 +63,7 @@ def __init__(
self.from_date = from_date
self.until_date = until_date

self.date_format = self.VALID_DATE_FORMATS[self.date_format]
self.date_format = VALID_DATE_FORMATS[self.date_format]

# Related to incremental crawls.
if crawl_directory:
Expand Down
18 changes: 9 additions & 9 deletions generic_scrapy/commands/incrementalupdate.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import csv
import os.path
from datetime import datetime, timedelta, timezone

import pandas as pd
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError

from generic_scrapy.base_spiders.base_spider import BaseSpider
from generic_scrapy.base_spiders.base_spider import VALID_DATE_FORMATS


class IncrementalUpdate(ScrapyCommand):
Expand All @@ -24,7 +24,7 @@ def add_options(self, parser):
parser.add_argument(
"--date_field_name",
type=str,
help="The data field to use for checking for the number of items downloaded the last time.",
help="The date field to use for checking for the number of items downloaded the last time.",
)
parser.add_argument(
"--crawl_directory",
Expand All @@ -50,12 +50,12 @@ def run(self, args, opts):

max_date = None
if opts.date_field_name:
directory = spidercls.get_file_store_directory()
file_name = f"{spidercls.export_outputs['main']['name']}.csv"
max_date = pd.read_csv(os.path.join(directory, file_name))[opts.date_field_name].agg(["max"])["max"]
max_date = datetime.strptime(max_date, BaseSpider.VALID_DATE_FORMATS["datetime"]).replace(
tzinfo=datetime.timezone.utc
) + timedelta(seconds=1)
with open(
    os.path.join(spidercls.get_file_store_directory(), f"{spidercls.export_outputs['main']['name']}.csv")
) as f:
    # Take the newest value of the date field across all previously exported
    # rows, parse it, and resume one second after it.
    #
    # BUG FIX: the original used `datetime.timezone.utc`, but `datetime` here
    # is the class from `from datetime import datetime` — the class has no
    # `timezone` attribute, so this raised AttributeError at runtime. Use
    # `timezone.utc` from the datetime module import instead.
    #
    # NOTE(review): `max()` raises ValueError on an empty CSV, and assumes the
    # date column is in "%Y-%m-%dT%H:%M:%S" form (lexicographic max == newest
    # only for that zero-padded format) — confirm against the export pipeline.
    max_date = datetime.strptime(
        max(row[opts.date_field_name] for row in csv.DictReader(f)), VALID_DATE_FORMATS["datetime"]
    ).replace(tzinfo=timezone.utc) + timedelta(seconds=1)

self.crawler_process.crawl(spidercls, from_date=max_date, crawl_directory=opts.crawl_directory)
self.crawler_process.start()
1 change: 0 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
pandas
scrapy
11 changes: 0 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,10 @@ lxml==4.9.3
# via
# parsel
# scrapy
numpy==1.25.2
# via pandas
packaging==23.1
# via
# parsel
# scrapy
pandas==2.0.3
# via -r requirements.in
parsel==1.8.1
# via
# itemloaders
Expand All @@ -77,10 +73,6 @@ pydispatcher==2.0.7
# via scrapy
pyopenssl==24.2.1
# via scrapy
python-dateutil==2.8.2
# via pandas
pytz==2023.3
# via pandas
queuelib==1.6.2
# via scrapy
requests==2.32.3
Expand All @@ -101,7 +93,6 @@ setuptools==74.1.1
six==1.16.0
# via
# automat
# python-dateutil
# requests-file
tldextract==3.4.4
# via scrapy
Expand All @@ -111,8 +102,6 @@ twisted==24.7.0rc1
# via scrapy
typing-extensions==4.7.1
# via twisted
tzdata==2023.3
# via pandas
urllib3==2.2.2
# via requests
w3lib==2.1.2
Expand Down
19 changes: 0 additions & 19 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,11 @@ lxml==4.9.3
# -r requirements.txt
# parsel
# scrapy
numpy==1.25.2
# via
# -r requirements.txt
# pandas
packaging==23.1
# via
# -r requirements.txt
# parsel
# scrapy
pandas==2.0.3
# via -r requirements.txt
parsel==1.8.1
# via
# -r requirements.txt
Expand Down Expand Up @@ -119,14 +113,6 @@ pyopenssl==24.2.1
# via
# -r requirements.txt
# scrapy
python-dateutil==2.8.2
# via
# -r requirements.txt
# pandas
pytz==2023.3
# via
# -r requirements.txt
# pandas
queuelib==1.6.2
# via
# -r requirements.txt
Expand Down Expand Up @@ -156,7 +142,6 @@ six==1.16.0
# via
# -r requirements.txt
# automat
# python-dateutil
# requests-file
tldextract==3.4.4
# via
Expand All @@ -174,10 +159,6 @@ typing-extensions==4.7.1
# via
# -r requirements.txt
# twisted
tzdata==2023.3
# via
# -r requirements.txt
# pandas
urllib3==2.2.2
# via
# -r requirements.txt
Expand Down

0 comments on commit 3f16685

Please sign in to comment.