main3.py
import time
start_time = time.time()
import sys
# sys.path.append('/usr/lib/python3/dist-packages/beautifulsoup4')
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import numpy as np
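# Scrape court decisions ("mahkeme kararlari") from legalbank.net: Selenium pages
# through the paginated search results, and every decision linked from a results
# page is downloaded with requests and saved to its own .txt file.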
from selenium import webdriver
from selenium.webdriver.common.by import By
# set chromedriver path (note: executable_path is deprecated in Selenium 4)
driver = webdriver.Chrome(executable_path="/Users/obaylan/PycharmProjects/scrapLegalNet/venv/chromedriver")
# implicit wait (seconds)
driver.implicitly_wait(0.5)
# maximize browser
driver.maximize_window()
# launch URL
driver.get('https://legalbank.net/arama/mahkeme-kararlari')
def next_page():
    # locate the "next page" button, scroll it into view, and click it
    element = driver.find_element(By.XPATH, '//*[@id="content_content_pgr_btnPagerNext"]')
    driver.execute_script("arguments[0].scrollIntoView();", element)
    element.click()
    # return the freshly loaded results page as parsed HTML
    content = driver.page_source
    soup = BeautifulSoup(content, 'lxml')
    return soup
def get_data(links, num):
    # download each decision page and save its 'belge' (document) text to a .txt file
    cnt = 0
    for link in links:
        html_text = requests.get(link).content.decode('utf-8')
        text = BeautifulSoup(html_text, 'lxml')
        try:
            res = text.find('div', class_='belge').text
        except AttributeError:
            res = 'NO BELGE'
        # file name is the page identifier followed by the index of the link on that page
        file_name = str(num) + str(cnt) + ".txt"
        print(file_name)
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(str(res))
        cnt = cnt + 1
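# the first results page is fetched once with plain requests; later pages are
# reached by clicking the pager button through Selenium (next_page)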
content = requests.get('https://legalbank.net/arama/mahkeme-kararlari').content
soup = BeautifulSoup(content, 'lxml')
results = soup.find_all('div', class_='result')
result_a = []
links_list = []
# for result in results:
# result_a.append(list(result.a.attrs.values())[0])
# for each in result_a:
# links_list.append(f'{"https://legalbank.net/"}{each}')
# sub_lists = np.array_split(links_list, 2)
# pool = Pool()
# result1 = pool.apply_async(get_data, [sub_lists[0]])
# result2 = pool.apply_async(get_data, [sub_lists[1]])
# answer1 = result1.get()
# answer2 = result2.get()
get_data(links_list, 1)  # no-op while the link extraction above is commented out (links_list is empty)
# click through the result pages up to page 2612 without scraping them
for i in range(2, 2612):
    next_page()
    print("--- %s seconds ---" % (time.time() - start_time))
# main scraping loop: load the next results page, collect the decision links on
# it, and download the two halves of the list in parallel.
# note: with the 'spawn' start method (Windows, macOS on Python 3.8+) this
# module-level code should be wrapped in an `if __name__ == "__main__":` guard.
for i in range(2612, 261812):
    soup = next_page()
    results = soup.find_all('div', class_='result')
    result_a = []
    links_list = []
    for result in results:
        result_a.append(list(result.a.attrs.values())[0])
    for each in result_a:
        links_list.append(f'https://legalbank.net/{each}')
    # split the links in two and fetch each half in its own worker; the distinct
    # "a"/"b" suffixes keep the output file names of the two halves from colliding
    sub_lists = np.array_split(links_list, 2)
    with Pool(2) as pool:
        result1 = pool.apply_async(get_data, [sub_lists[0], f"{i}a"])
        result2 = pool.apply_async(get_data, [sub_lists[1], f"{i}b"])
        result1.get()
        result2.get()
    print("--- %s seconds ---" % (time.time() - start_time), " COUNT=", i)
#close browser
driver.quit()