Skip to content

Commit

Permalink
Merge pull request #2 from donggook-me/development
Browse files Browse the repository at this point in the history
[FEAT] add gptCaller
  • Loading branch information
donggook-me authored Sep 4, 2023
2 parents 19e2a94 + 31065b6 commit 6b72824
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 3 deletions.
5 changes: 5 additions & 0 deletions gitPrincipal.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1. master 브랜치는 최종 배포용이다.
2. 새로운 기능 추가시 feature 브랜치로 checkout 해서 작업한다.
3. feature 브랜치에서 작업이 끝나면 리드에게 development 브랜치로 merging request 한다.
4. 리드가 확인 후 development 브랜치에 새로운 기능을 병합한다.
5. 신규 기능들이 모두 병합되고 정상 작동될시 master 브랜치로 합치고, 배포한다.
1 change: 1 addition & 0 deletions install.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ On Linux (Ubuntu/Debian):
sudo apt-get install tesseract-ocr



1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ beautifulsoup4==4.10.0
Pillow==8.2.0
pytesseract==0.3.8
pymongo==3.12.0
openai==0.27.0
15 changes: 14 additions & 1 deletion src/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from filter import Filter
from reader import Reader
import pymongo
from gptCaller import GPTAPICaller
from config import OPENAI_API_KEY


def scrape_kookmin_news(url):
# Send a GET request to the URL and parse the HTML
Expand Down Expand Up @@ -31,20 +34,30 @@ def scrape_kookmin_news(url):
# url 에서 장학금 관련 이미지만 추출하는 코드.
image_info = [img["src"] for img in soup.find_all("img") if img["src"].startswith("https://")]
image_text = ""

for img_url in image_info:
# Use the Reader class to extract text from the image URL
img_text = Reader.read_text_from_image_url(img_url)
img_text = Filter.filter_text(img_text)
# Add the extracted text to the post_info
image_text += "\n" + (img_text)

# Initialize the GPT API caller with your API key
openai_api_key = "YOUR_API_KEY"
gpt_caller = GPTAPICaller(openai_api_key)

# Call the GPT API to generate text based on the prompt
generated_text = gpt_caller.generate_text(text_info, image_text)


# Add the post information to the post list
post_info.append({
"title": title,
"url": link,
"text_info": text_info,
"image_text": image_text,
"image_info": image_info
"image_info": image_info,
"generated_text": generated_text
})

return post_info
Expand Down
48 changes: 48 additions & 0 deletions src/gptCaller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import openai

class GPTAPICaller:
def __init__(self, api_key, model="text-davinci-002"):
    """Remember the completion model and configure the OpenAI client key.

    Note: ``openai.api_key`` is module-global state, so constructing a
    new caller rebinds the key for every GPTAPICaller in the process.
    """
    self.model = model
    openai.api_key = api_key

def generate_text(self, text):
    """Send *text* as a completion prompt and return the model's reply.

    Returns the stripped completion string, or None when the API call
    fails for any reason (the error is printed, not re-raised).
    """
    try:
        reply = openai.Completion.create(
            engine=self.model,
            prompt=text,
            max_tokens=100,  # response-length cap; adjust to your needs
        ).choices[0].text
        return reply.strip()
    except Exception as err:
        # Best-effort call: report the failure and signal it with None.
        print("Error calling the GPT API:", err)
        return None

def generate_Integrated_text(self, text_first, text_second):
try:
# Example text prompt
prompt = f"""Hi you are here for summarizing and anlayzing texts I'll give you.
the sentence starts from ::: is the text what I want you to work. there is two kind of text.
this is the fist text which is grammerly correct ::: {text_first}
this is the second text which is grammerly incorrect. so you need to make it clear. ::: {text_second}
this is command.
STEP 1 : The first or second one could be empty In this case you just return "No Content"
If there is at least one kind of text, try to find below infomation from the sentences.
this is korean. So I'll give you the korean word category that I ask you to find.
["신청기간", "지원자격", "지원혜택", "전반적인 장학금 요약"]
STEP 2 :So In case there is some text infomation,
please return following Json type Including above category(by korean word)
"""

integrated_text = self.generate_text(prompt)
return integrated_text
4 changes: 2 additions & 2 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
db = client["kookmin_news"] # Replace "kookmin_news" with your database name
collection = db["posts"]

def save_to_mongodb(url):
def crawlAndSave(url):
# Get data from crawling
post_data_list = scrape_kookmin_news(url)

Expand All @@ -30,6 +30,6 @@ def check_saved_data():

if __name__ == "__main__":
url = 'https://www.kookmin.ac.kr/user/kmuNews/notice/7/index.do'
save_to_mongodb(url)
crawlAndSave(url)
check_saved_data()
print("saving complete")

0 comments on commit 6b72824

Please sign in to comment.