Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter date articles for all methods #106

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions gnews/gnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bs4 import BeautifulSoup as Soup

from gnews.utils.constants import AVAILABLE_COUNTRIES, AVAILABLE_LANGUAGES, TOPICS, BASE_URL, USER_AGENT
from gnews.utils.utils import process_url
from gnews.utils.utils import period_to_datetime, process_url, str_to_datetime

# Configure the root logger at import time: INFO level, each record rendered
# as "<timestamp> - <message>" with a 12-hour month/day/year timestamp.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO,
datefmt='%m/%d/%Y %I:%M:%S %p')
Expand Down Expand Up @@ -200,7 +200,7 @@ def _clean(html):

def _process(self, item):
url = process_url(item, self._exclude_websites)
if url:
if url and (inspect.stack()[2][3] == 'get_news' or inspect.stack()[2][3] == 'get_news_by_site' or self._is_date_in_range(item.get("published", ""))):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @v-cardona, can you please add test cases for this change? Also, can you please update README.md with an example?

title = item.get("title", "")
item = {
'title': title,
Expand All @@ -210,6 +210,18 @@ def _process(self, item):
'publisher': item.get("source", " ")
}
return item

def _is_date_in_range(self, date_str):
    """Tell whether an article's publication date passes the active date filter.

    Exactly one filter is applied, in this order of precedence:

    1. ``self._start_date`` and ``self._end_date`` both set: the publication
       *date* (time of day ignored) must lie in the inclusive range.
    2. ``self._period`` set: the publication timestamp must not be older than
       the start of the period as computed by ``period_to_datetime``.
    3. Neither set: every article is accepted.

    :param date_str: publication timestamp in the textual format understood
        by ``str_to_datetime``; may be empty when the feed item has no date.
    :return: ``True`` if the article should be kept, ``False`` otherwise.
    """
    has_explicit_range = bool(self._start_date and self._end_date)

    # Without any filter there is nothing to check, so do not even try to
    # parse the date: items lacking a "published" field must still pass.
    if not has_explicit_range and not self._period:
        return True

    try:
        published_date = str_to_datetime(date_str)
    except (TypeError, ValueError):
        # Missing or malformed date: it cannot be shown to lie inside the
        # requested window, so exclude the article instead of crashing.
        return False

    if has_explicit_range:
        published_day = published_date.date()
        return self._start_date.date() <= published_day <= self._end_date.date()

    return period_to_datetime(self._period) <= published_date

def docstring_parameter(*sub):
def dec(obj):
Expand Down Expand Up @@ -242,7 +254,7 @@ def get_top_news(self):
"""
This function returns top news stories for the current time
:return: A list of dictionaries with structure: {0}.
..To implement date range try get_news('?')
..To retrieve news for a specific date range, try get_news('?')
"""
query = "?"
return self._get_news(query)
Expand All @@ -253,7 +265,7 @@ def get_news_by_topic(self, topic: str):
Function to get news from one of Google's key topics
:param topic: TOPIC names i.e {1}
:return: A list of dictionaries with structure: {0}.
..To implement date range try get_news('topic')
..To retrieve news for a specific date range, try get_news('topic')
"""
topic = topic.upper()
if topic in TOPICS:
Expand All @@ -269,7 +281,7 @@ def get_news_by_location(self, location: str):
This function is used to get news from a specific location (city, state, and country)
:param location: (type: str) The location for which you want to get headlines
:return: A list of dictionaries with structure: {0}.
..To implement date range try get_news('location')
..To retrieve news for a specific date range, try get_news('location')
"""
if location:
query = '/headlines/section/geo/' + location + '?'
Expand Down
18 changes: 18 additions & 0 deletions gnews/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime, timedelta
import hashlib
import json
import logging
Expand All @@ -24,3 +25,20 @@ def process_url(item, exclude_websites):
if re.match(GOOGLE_NEWS_REGEX, url):
url = requests.head(url).headers.get('location', url)
return url

def str_to_datetime(date_str:str):
    """Parse a feed timestamp such as ``'Tue, 05 Jul 2022 07:00:00 GMT'``.

    :param date_str: timestamp text ending in the literal ``GMT``.
    :return: the corresponding naive ``datetime`` (no tzinfo attached).
    :raises ValueError: if ``date_str`` does not match the expected layout.
    """
    gmt_layout = '%a, %d %b %Y %H:%M:%S GMT'
    parsed = datetime.strptime(date_str, gmt_layout)
    return parsed

def period_to_datetime(period: str):
    """Return the moment at which a relative period such as ``'7d'`` begins.

    ``period`` is an integer amount followed by a one-letter unit:

    * ``h`` - hours
    * ``d`` - days
    * ``m`` - months, approximated as 30 days each
    * ``y`` - years, approximated as 365 days each

    :param period: the period string, e.g. ``'12h'``, ``'7d'``, ``'6m'``.
    :return: a naive ``datetime`` expressed in UTC equal to "now" minus the
        period. UTC is used because the value is compared against article
        timestamps parsed from ``... GMT`` strings, which are naive as well.
    :raises ValueError: if the amount is not an integer or the unit is not
        one of ``h``, ``d``, ``m``, ``y``.
    """
    # Local import so the module-level import line does not have to change.
    from datetime import timezone

    unit_lengths = {
        'h': timedelta(hours=1),
        'd': timedelta(days=1),
        'm': timedelta(days=30),
        'y': timedelta(days=365),
    }

    amount = int(period[:-1])
    unit = period[-1]
    if unit not in unit_lengths:
        # Previously this fell through every branch and failed with an
        # UnboundLocalError on the return statement.
        raise ValueError(
            "Unsupported period unit %r in %r; expected one of: h, d, m, y"
            % (unit, period)
        )

    # Drop tzinfo after taking the UTC time so the result stays comparable
    # with the naive datetimes used elsewhere.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    return utc_now - amount * unit_lengths[unit]