Skip to content

Commit

Permalink
Merge pull request #2 from donggook-me/development
Browse files Browse the repository at this point in the history
[FEAT] add gptCaller
  • Loading branch information
donggook-me authored Sep 4, 2023
2 parents 19e2a94 + 31065b6 commit 6b72824
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 3 deletions.
5 changes: 5 additions & 0 deletions gitPrincipal.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1. master 브랜치는 최종 배포용이다.
2. 새로운 기능 추가시 feature 브랜치로 checkout 해서 작업한다.
3. feature 브랜치에서 작업이 끝나면 리드에게 development 브랜치로 merging request 한다.
4. 리드가 확인 후 development 브랜치에 새로운 기능을 병합한다.
5. 신규 기능들이 모두 병합되고 정상 작동될시 master 브랜치로 합치고, 배포한다.
1 change: 1 addition & 0 deletions install.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ On Linux (Ubuntu/Debian):
sudo apt-get install tesseract-ocr



1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ beautifulsoup4==4.10.0
Pillow==8.2.0
pytesseract==0.3.8
pymongo==3.12.0
openai==0.27.0
15 changes: 14 additions & 1 deletion src/crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from filter import Filter
from reader import Reader
import pymongo
from gptCaller import GPTAPICaller
from config import OPENAI_API_KEY


def scrape_kookmin_news(url):
# Send a GET request to the URL and parse the HTML
Expand Down Expand Up @@ -31,20 +34,30 @@ def scrape_kookmin_news(url):
# url 에서 장학금 관련 이미지만 추출하는 코드.
image_info = [img["src"] for img in soup.find_all("img") if img["src"].startswith("https://")]
image_text = ""

for img_url in image_info:
# Use the Reader class to extract text from the image URL
img_text = Reader.read_text_from_image_url(img_url)
img_text = Filter.filter_text(img_text)
# Add the extracted text to the post_info
image_text += "\n" + (img_text)

# Initialize the GPT API caller with your API key
openai_api_key = "YOUR_API_KEY"
gpt_caller = GPTAPICaller(openai_api_key)

# Call the GPT API to generate text based on the prompt
generated_text = gpt_caller.generate_text(text_info, image_text)


# Add the post information to the post list
post_info.append({
"title": title,
"url": link,
"text_info": text_info,
"image_text": image_text,
"image_info": image_info
"image_info": image_info,
"generated_text": generated_text
})

return post_info
Expand Down
48 changes: 48 additions & 0 deletions src/gptCaller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import openai

class GPTAPICaller:
def __init__(self, api_key, model="text-davinci-002"):
    """Remember the completion model and configure the OpenAI client key.

    Note: ``openai.api_key`` is module-global state, so constructing a
    new caller rebinds the key for every GPTAPICaller in the process.
    """
    self.model = model
    openai.api_key = api_key

def generate_text(self, text):
    """Send *text* as a completion prompt and return the model's reply.

    Returns the stripped completion string, or None when the API call
    fails for any reason (the error is printed, not re-raised).
    """
    try:
        reply = openai.Completion.create(
            engine=self.model,
            prompt=text,
            max_tokens=100,  # response-length cap; adjust to your needs
        ).choices[0].text
        return reply.strip()
    except Exception as err:
        # Best-effort call: report the failure and signal it with None.
        print("Error calling the GPT API:", err)
        return None

def generate_Integrated_text(self, text_first, text_second):
try:
# Example text prompt
prompt = f"""Hi you are here for summarizing and anlayzing texts I'll give you.
the sentence starts from ::: is the text what I want you to work. there is two kind of text.
this is the fist text which is grammerly correct ::: {text_first}
this is the second text which is grammerly incorrect. so you need to make it clear. ::: {text_second}
this is command.
STEP 1 : The first or second one could be empty In this case you just return "No Content"
If there is at least one kind of text, try to find below infomation from the sentences.
this is korean. So I'll give you the korean word category that I ask you to find.
["신청기간", "지원자격", "지원혜택", "전반적인 장학금 요약"]
STEP 2 :So In case there is some text infomation,
please return following Json type Including above category(by korean word)
"""

integrated_text = self.generate_text(prompt)
return integrated_text
4 changes: 2 additions & 2 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
db = client["kookmin_news"] # Replace "kookmin_news" with your database name
collection = db["posts"]

def save_to_mongodb(url):
def crawlAndSave(url):
# Get data from crawling
post_data_list = scrape_kookmin_news(url)

Expand All @@ -30,6 +30,6 @@ def check_saved_data():

if __name__ == "__main__":
url = 'https://www.kookmin.ac.kr/user/kmuNews/notice/7/index.do'
save_to_mongodb(url)
crawlAndSave(url)
check_saved_data()
print("saving complete")

0 comments on commit 6b72824

Please sign in to comment.