From 7c7ddeae5ee5f07b947a542989d134e76b34d4a6 Mon Sep 17 00:00:00 2001 From: subaru-hello Date: Sat, 23 Nov 2024 09:23:59 +0900 Subject: [PATCH 1/2] empty From 4283f2c09cc4751661f2bc1864d48dad53fbfe08 Mon Sep 17 00:00:00 2001 From: subaru-hello Date: Sat, 23 Nov 2024 22:45:51 +0900 Subject: [PATCH 2/2] =?UTF-8?q?=E4=B8=96=E7=94=B0=E8=B0=B7=E7=B7=8F?= =?UTF-8?q?=E5=90=88=E9=81=8B=E5=8B=95=E5=85=AC=E5=9C=92=E3=81=AEHTML?= =?UTF-8?q?=E3=82=92ETL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- py-functions/main.py | 4 +- .../kanagawa/kawasaki}/todoroki.py | 0 .../src/handlers/setagaya_sougou_scrape.py | 10 --- .../src/handlers/tokyo/setagaya/scraping.py | 86 +++++++++++++++++++ .../tokyo/setagaya/setagaya_sougou_scrape.py | 12 +++ py-functions/src/models/scraping.py | 31 ------- 6 files changed, 100 insertions(+), 43 deletions(-) rename py-functions/src/{models => handlers/kanagawa/kawasaki}/todoroki.py (100%) delete mode 100644 py-functions/src/handlers/setagaya_sougou_scrape.py create mode 100644 py-functions/src/handlers/tokyo/setagaya/scraping.py create mode 100644 py-functions/src/handlers/tokyo/setagaya/setagaya_sougou_scrape.py delete mode 100644 py-functions/src/models/scraping.py diff --git a/py-functions/main.py b/py-functions/main.py index 276cf56..8cdb4fe 100644 --- a/py-functions/main.py +++ b/py-functions/main.py @@ -7,7 +7,7 @@ from src.handlers.addmessage import addmessage from src.handlers.makeuppercase import makeuppercase from src.handlers.test import introduction -from src.handlers.setagaya_sougou_scrape import scraping +from src.handlers.tokyo.setagaya.setagaya_sougou_scrape import setagaya_sougou_availability initialize_app() @@ -19,4 +19,4 @@ def on_request_example(req: https_fn.Request) -> https_fn.Response: addmessage makeuppercase introduction -scraping \ No newline at end of file +setagaya_sougou_availability \ No newline at end of file diff --git a/py-functions/src/models/todoroki.py 
b/py-functions/src/handlers/kanagawa/kawasaki/todoroki.py similarity index 100% rename from py-functions/src/models/todoroki.py rename to py-functions/src/handlers/kanagawa/kawasaki/todoroki.py diff --git a/py-functions/src/handlers/setagaya_sougou_scrape.py b/py-functions/src/handlers/setagaya_sougou_scrape.py deleted file mode 100644 index 3a39e8b..0000000 --- a/py-functions/src/handlers/setagaya_sougou_scrape.py +++ /dev/null @@ -1,10 +0,0 @@ -import requests -from bs4 import BeautifulSoup -from firebase_functions import https_fn -from src.models.scraping import Scraping -# -@https_fn.on_request() -def scraping(req: https_fn.Request) -> https_fn.Response: - url = "https://www.se-sports.or.jp/facility/sougou/" - scraping_result = Scraping(url).execute() - return https_fn.Response(scraping_result, status=200) \ No newline at end of file diff --git a/py-functions/src/handlers/tokyo/setagaya/scraping.py b/py-functions/src/handlers/tokyo/setagaya/scraping.py new file mode 100644 index 0000000..2b94652 --- /dev/null +++ b/py-functions/src/handlers/tokyo/setagaya/scraping.py @@ -0,0 +1,86 @@ +import requests +from bs4 import BeautifulSoup +import logging + +logging.basicConfig(level=logging.INFO) + + +class Scraping: + TARGET_TEXT = "総合運動場 個人開放" + KEYWORDS = ["陸上競技", "・陸上"] + + def __init__(self, url): + self.url = url + + def get_soup(self, url): + """HTMLを取得してBeautifulSoupオブジェクトを返す""" + try: + response = requests.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, 'html.parser') + except requests.RequestException as e: + logging.error(f"ウェブページの取得中にエラーが発生しました: {e}") + return None + + def get_href(self, url): + """個人開放のリンクを取得""" + soup = self.get_soup(url) + if not soup: + return None + + news_list = soup.find('div', class_='news-list') + if not news_list: + logging.error("指定されたセクション 'news-list' が見つかりません。") + return None + + for item in news_list.find_all('div', class_='news-item'): + news_link_div = item.find('div', class_='news-link') + if 
not news_link_div: + continue + news_link_text = news_link_div.get_text(strip=True) + if self.TARGET_TEXT in news_link_text: + a_tag = item.find("a", href=True) + if a_tag: + return a_tag['href'] + logging.info(f"'{self.TARGET_TEXT}' に一致するリンクが見つかりませんでした。") + return None + + def get_availability_today(self, url): + """詳細ページのタイトルと内容を取得""" + soup = self.get_soup(url) + if not soup: + return None, None + + title_tag = soup.find("h1") + title = title_tag.get_text(strip=True) if title_tag else "タイトルが見つかりません。" + + content_div = soup.find('div', class_="news-contents") + if not content_div: + logging.error("コンテンツのセクション 'news-contents' が見つかりません。") + return title, "コンテンツが見つかりませんでした。" + + body_text = "" + for paragraph in content_div.find_all("p"): + paragraph_content = paragraph.decode_contents().replace("
", "\n") + for line in paragraph_content.split("\n"): + clean_line = BeautifulSoup(line, 'html.parser').get_text(strip=True) + if any(keyword in clean_line for keyword in self.KEYWORDS): + body_text += clean_line + "\n" + + if not body_text: + body_text = "該当するコンテンツが見つかりませんでした。" + + return title, body_text + + def execute(self): + """実行""" + href = self.get_href(self.url) + if not href: + logging.error(f"'{self.TARGET_TEXT}' のリンクが見つかりません。") + return None, None + + absolute_url = href if href.startswith("http") else f"{self.url.rstrip('/')}/{href.lstrip('/')}" + + title, body = self.get_availability_today(absolute_url) + logging.info(f"タイトル: {title}\n内容:\n{body}") + return title, body \ No newline at end of file diff --git a/py-functions/src/handlers/tokyo/setagaya/setagaya_sougou_scrape.py b/py-functions/src/handlers/tokyo/setagaya/setagaya_sougou_scrape.py new file mode 100644 index 0000000..2f13021 --- /dev/null +++ b/py-functions/src/handlers/tokyo/setagaya/setagaya_sougou_scrape.py @@ -0,0 +1,12 @@ +from firebase_functions import https_fn +from src.handlers.tokyo.setagaya.scraping import Scraping +@https_fn.on_request() +def setagaya_sougou_availability(req: https_fn.Request) -> https_fn.Response: + "世田谷総合運動場の陸上競技場貸出状況を取得する関数" + url = "https://www.se-sports.or.jp/facility/sougou/" + title, body = Scraping(url).execute() + html = f""" +

<h1>{title}</h1>
+    <pre>
+    {body}
+    </pre>
+ """ + return https_fn.Response(html, status=200) \ No newline at end of file diff --git a/py-functions/src/models/scraping.py b/py-functions/src/models/scraping.py deleted file mode 100644 index b2ed349..0000000 --- a/py-functions/src/models/scraping.py +++ /dev/null @@ -1,31 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -class Scraping: - def __init__(self, url): - self.url = url - - def execute(self): - # サイトのHTMLを取得 - response = requests.get(self.url) - res = "" - if response.status_code == 200: - html_content = response.text - - # htmlを掬い取る - soup = BeautifulSoup(html_content, 'html.parser') - - news_list = soup.find('div', class_='news-list') - if news_list: - items = news_list.find_all('div', class_='news-item') - for item in items: - date = item.find('div', class_='news-time').text - description = item.find('div', class_='news-link').text - print(f"Date: {date} - Info: {description}") - res = f"Date: {date} - Info: {description}" - else: - print("Couldn't find the required section.") - else: - print(f"Failed to fetch the webpage. Status code: {response.status_code}") - - return res \ No newline at end of file