Skip to content

Commit

Permalink
changed docker and grabbing
Browse files (browse the repository at this point in the history)
  • Loading branch information
Wamy-Dev committed Jun 21, 2022
1 parent d6bfdca commit e23a654
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.env
Binary file added components/.DS_Store
Binary file not shown.
4 changes: 0 additions & 4 deletions components/.env

This file was deleted.

18 changes: 4 additions & 14 deletions components/grabber.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,32 @@
#Libraries and importing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException
import random
import json
import time
import decouple
from decouple import config
import requests
from os import getcwd
#
SELENIUMCLIENT = config('SELENIUMCLIENT')
print('starting process')
#getting updated input file
url = "https://raw.githubusercontent.com/Wamy-Dev/ReziWebsite/main/Input%20Data.txt"
directory = getcwd()
r = requests.get(url)
data = open("./components/Input Data.txt", "wb")
data.write(r.content)
data.close()
#setting up chrome settings
uc = webdriver
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless') #remove hashtag at the start to run in headless mode, must also remove extension for this to work, not recommended
chrome_options = uc.ChromeOptions()
chrome_options.add_extension('./resources/ublockorigin.crx')
chrome_options.add_extension('./resources/popupblockerpro.crx')
chrome_options.add_argument("start-maximized")
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.arguments.extend(["--no-sandbox", "--disable-setuid-sandbox"])
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
#wd = uc.Chrome(executable_path='./resources/chromedriver',options=chrome_options) #if local, make sure
wd = uc.Remote(SELENIUMCLIENT, options=chrome_options) #if for remote
wd = uc.Chrome(chrome_options) #if in docker container
json_data={}
#getting the links and setting up json
def link_container(site_name,container_tag,class_tag,html,domain):
Expand Down
2 changes: 0 additions & 2 deletions components/sendtosearch.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import meilisearch
import requests
import json
import os
from datetime import datetime
import decouple
from decouple import config
#
SEARCHCLIENT = config('SEARCHCLIENT')
Expand Down
34 changes: 17 additions & 17 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#Welcome to Rezi!
# ____ _
# / __ \___ ____ (_)
# / /_/ / _ \/_ / / /
# / _, _/ __/ / /_/ /
#/_/ |_|\___/ /___/_/
#Rezi was written in Python 3.9.6 on Sublime Text.
#Please visit the github at https://github.com/Wamy-Dev/ReziWebsite
import requests
import sys
from datetime import datetime
sys.path.append('./components')
#
now=datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f"Time started: {current_time}.")
#
#Welcome to Rezi!
# ____ _
# / __ \___ ____ (_)
# / /_/ / _ \/_ / / /
# / _, _/ __/ / /_/ /
#/_/ |_|\___/ /___/_/
#Rezi was written in Python 3.9.6 on Sublime Text.
#Please visit the github at https://github.com/Wamy-Dev/ReziWebsite
import requests
import sys
from datetime import datetime
sys.path.append('./components')
#
now=datetime.now()
current_time = now.strftime("%H:%M:%S")
print(f"Time started: {current_time}.")
#
import grabber
2 changes: 1 addition & 1 deletion requirements.txt → resources/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
beautifulsoup4
selenium
selenium-stealth
undetected-chromedriver
output
meilisearch
python-decouple

0 comments on commit e23a654

Please sign in to comment.