From 3f1668537a09ad0106b741b99e063620f380fc32 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Sat, 28 Sep 2024 14:10:56 -0400
Subject: [PATCH] chore: Remove pandas

---
 generic_scrapy/base_spiders/base_spider.py   | 16 ++++++++--------
 generic_scrapy/commands/incrementalupdate.py | 20 ++++++++++----------
 requirements.in                              |  1 -
 requirements.txt                             | 11 -----------
 requirements_dev.txt                         | 19 -------------------
 5 files changed, 18 insertions(+), 49 deletions(-)

diff --git a/generic_scrapy/base_spiders/base_spider.py b/generic_scrapy/base_spiders/base_spider.py
index f9fd490..5b37551 100644
--- a/generic_scrapy/base_spiders/base_spider.py
+++ b/generic_scrapy/base_spiders/base_spider.py
@@ -3,6 +3,13 @@
 import scrapy
 from scrapy.exceptions import UsageError
 
+VALID_DATE_FORMATS = {
+    "date": "%Y-%m-%d",
+    "datetime": "%Y-%m-%dT%H:%M:%S",
+    "year": "%Y",
+    "year-month": "%Y-%m",
+}
+
 
 class BaseSpider(scrapy.Spider):
     """
@@ -27,13 +34,6 @@ class BaseSpider(scrapy.Spider):
 
     """
 
-    VALID_DATE_FORMATS = {
-        "date": "%Y-%m-%d",
-        "datetime": "%Y-%m-%dT%H:%M:%S",
-        "year": "%Y",
-        "year-month": "%Y-%m",
-    }
-
     # Regarding the data source.
     date_format = "datetime"
     date_required = False
@@ -63,7 +63,7 @@ def __init__(
         self.from_date = from_date
         self.until_date = until_date
 
-        self.date_format = self.VALID_DATE_FORMATS[self.date_format]
+        self.date_format = VALID_DATE_FORMATS[self.date_format]
 
         # Related to incremental crawls.
         if crawl_directory:
diff --git a/generic_scrapy/commands/incrementalupdate.py b/generic_scrapy/commands/incrementalupdate.py
index 0b2782f..29fa7bf 100644
--- a/generic_scrapy/commands/incrementalupdate.py
+++ b/generic_scrapy/commands/incrementalupdate.py
@@ -1,11 +1,11 @@
+import csv
 import os.path
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 
-import pandas as pd
 from scrapy.commands import ScrapyCommand
 from scrapy.exceptions import UsageError
 
-from generic_scrapy.base_spiders.base_spider import BaseSpider
+from generic_scrapy.base_spiders.base_spider import VALID_DATE_FORMATS
 
 
 class IncrementalUpdate(ScrapyCommand):
@@ -24,7 +24,7 @@ def add_options(self, parser):
         parser.add_argument(
             "--date_field_name",
             type=str,
-            help="The data field to use for checking for the number of items downloaded the last time.",
+            help="The date field used to find the most recent item from the last crawl.",
         )
         parser.add_argument(
             "--crawl_directory",
@@ -50,12 +50,12 @@ def run(self, args, opts):
 
         max_date = None
         if opts.date_field_name:
-            directory = spidercls.get_file_store_directory()
-            file_name = f"{spidercls.export_outputs['main']['name']}.csv"
-            max_date = pd.read_csv(os.path.join(directory, file_name))[opts.date_field_name].agg(["max"])["max"]
-            max_date = datetime.strptime(max_date, BaseSpider.VALID_DATE_FORMATS["datetime"]).replace(
-                tzinfo=datetime.timezone.utc
-            ) + timedelta(seconds=1)
+            with open(
+                os.path.join(spidercls.get_file_store_directory(), f"{spidercls.export_outputs['main']['name']}.csv")
+            ) as f:
+                max_date = datetime.strptime(
+                    max(row[opts.date_field_name] for row in csv.DictReader(f)), VALID_DATE_FORMATS["datetime"]
+                ).replace(tzinfo=timezone.utc) + timedelta(seconds=1)
 
         self.crawler_process.crawl(spidercls, from_date=max_date, crawl_directory=opts.crawl_directory)
         self.crawler_process.start()
diff --git a/requirements.in b/requirements.in
index acdade7..268b0db 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,2 +1 @@
-pandas
 scrapy
diff --git a/requirements.txt b/requirements.txt
index 0b324dc..0b4de64 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,14 +51,10 @@ lxml==4.9.3
     # via
     #   parsel
     #   scrapy
-numpy==1.25.2
-    # via pandas
 packaging==23.1
     # via
     #   parsel
     #   scrapy
-pandas==2.0.3
-    # via -r requirements.in
 parsel==1.8.1
     # via
     #   itemloaders
@@ -77,10 +73,6 @@ pydispatcher==2.0.7
     # via scrapy
 pyopenssl==24.2.1
     # via scrapy
-python-dateutil==2.8.2
-    # via pandas
-pytz==2023.3
-    # via pandas
 queuelib==1.6.2
     # via scrapy
 requests==2.32.3
@@ -101,7 +93,6 @@ setuptools==74.1.1
 six==1.16.0
     # via
     #   automat
-    #   python-dateutil
     #   requests-file
 tldextract==3.4.4
     # via scrapy
@@ -111,8 +102,6 @@ twisted==24.7.0rc1
     # via scrapy
 typing-extensions==4.7.1
     # via twisted
-tzdata==2023.3
-    # via pandas
 urllib3==2.2.2
     # via requests
 w3lib==2.1.2
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 974b685..e7edf09 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -78,17 +78,11 @@ lxml==4.9.3
     # via
     #   -r requirements.txt
     #   parsel
     #   scrapy
-numpy==1.25.2
-    # via
-    #   -r requirements.txt
-    #   pandas
 packaging==23.1
     # via
     #   -r requirements.txt
     #   parsel
     #   scrapy
-pandas==2.0.3
-    # via -r requirements.txt
 parsel==1.8.1
     # via
     #   -r requirements.txt
@@ -119,14 +113,6 @@ pyopenssl==24.2.1
     # via
     #   -r requirements.txt
     #   scrapy
-python-dateutil==2.8.2
-    # via
-    #   -r requirements.txt
-    #   pandas
-pytz==2023.3
-    # via
-    #   -r requirements.txt
-    #   pandas
 queuelib==1.6.2
     # via
     #   -r requirements.txt
@@ -156,7 +142,6 @@ six==1.16.0
     # via
     #   -r requirements.txt
     #   automat
-    #   python-dateutil
     #   requests-file
 tldextract==3.4.4
     # via
@@ -174,10 +159,6 @@ typing-extensions==4.7.1
     # via
     #   -r requirements.txt
     #   twisted
-tzdata==2023.3
-    # via
-    #   -r requirements.txt
-    #   pandas
 urllib3==2.2.2
     # via
     #   -r requirements.txt
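
For reference, the stdlib-only replacement for pandas' read_csv/max aggregation can be exercised on its own. This is a minimal sketch, not part of the patch: "data.csv" and the "date" column are hypothetical stand-ins for the spider's export file and the --date_field_name option.

import csv
from datetime import datetime, timedelta, timezone

DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"  # VALID_DATE_FORMATS["datetime"]

with open("data.csv") as f:
    # Find the latest date string. Because the format is fixed-width and
    # ordered year -> month -> day -> time, max() on the raw strings gives
    # the same answer as max() on the parsed datetimes, with no parsing cost.
    max_date = max(row["date"] for row in csv.DictReader(f))

# Resume one second after the newest previously downloaded item,
# mirroring what the incrementalupdate command computes for from_date.
from_date = datetime.strptime(max_date, DATETIME_FORMAT).replace(
    tzinfo=timezone.utc
) + timedelta(seconds=1)
print(from_date)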