-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
117 lines (97 loc) · 3.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import csv
import json
import sys
import requests
import urllib
from urllib.request import Request, urlopen, HTTPError
from urllib.parse import urlparse
import argparse
parser = argparse.ArgumentParser()
# -f/--file: path to a text file listing one target site URL per line (batch mode).
parser.add_argument('-f', '--file', help='To mention file')
# -u/--url: a single target site URL (used when -f is not supplied).
parser.add_argument('-u', '--url', help='Passing one url')
# -p/--pages: fetch the WP REST "pages" route instead of the default "posts".
parser.add_argument('-p', '--pages', action='store_true', help='To download pages/post')
# Parsed at import time; the driver code at the bottom of the file reads these.
args = parser.parse_args()
def get_urls(filename):
    """Return the list of URLs stored in *filename*, one URL per line.

    Lines are stripped of surrounding whitespace so both ``\n`` and
    ``\r\n`` line endings are handled, and blank lines are skipped so
    they cannot turn into bogus request URLs downstream.

    The original implementation leaked the file handle and kept empty
    strings for blank lines; ``with`` guarantees the handle is closed.
    """
    with open(filename, "r") as fh:
        return [line.strip() for line in fh if line.strip()]
def get_data_url(url_link):
    """Download a WordPress REST endpoint and archive the result.

    Fetches *url_link* (expected to be a ``/wp-json/wp/v2/posts`` or
    ``pages`` route returning a JSON array), then writes two files named
    after the URL's hostname:

    * ``data/<host>-raw.txt``  — the raw response bytes, and
    * ``data/<host>-data.json`` — a trimmed JSON list with the
      modified date, rendered title, rendered content and link of
      each entry.

    Prints the hostname, one "Title"/"Status" pair per entry, and a
    final success message.  Raises ``urllib.error.HTTPError`` on a
    non-2xx response (handled by the caller) and ``json.JSONDecodeError``
    if the response is not JSON.
    """
    req = Request(url_link, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()

    # The hostname of the URL names both output files.
    parsed_uri = urlparse(url_link)
    result = '{uri.netloc}'.format(uri=parsed_uri)
    print(result)

    # The original crashed with FileNotFoundError if data/ did not exist.
    os.makedirs("data", exist_ok=True)

    # Archive the raw response bytes.
    filename = "data/" + result + "-raw.txt"
    with open(filename, 'wb') as raw_file:
        raw_file.write(webpage)

    # Parse the bytes we already hold instead of re-reading the file
    # from disk (json.loads accepts UTF-8 bytes directly).
    json_data = json.loads(webpage)

    C_data = []
    for n in json_data:
        # Keep only the fields we care about, renamed for the archive.
        j_data = {
            "modified/posted": n['modified'],
            "title": n['title']['rendered'],
            "content": n['content']['rendered'],
            "link": n['link'],
        }
        C_data.append(j_data)
        print("Title: " + j_data["title"])
        print("Status: Downloaded")

    # Write the trimmed archive next to the raw dump.
    json_object = json.dumps(C_data, indent = 4)
    with open("data/" + result + "-data.json", "w") as outfile:
        outfile.write(json_object)
    print("Extracted Successfully")
# --- Command-line driver -------------------------------------------------
# Explanations printed for known HTTP status codes in single-URL mode;
# unknown codes are silently ignored, matching the original behavior.
HTTP_ERROR_MESSAGES = {
    404: "Oops! The URL you supplied is not a WordPress URL. Please check for any typo and try again.",
    403: "The URL you are trying to locate is forbidden.",
    401: "The URL you are trying to access has marked you unauthorized.",
    408: "The URL you are trying to access has timed out.",
    500: "The URL you are trying to access has returned Internal Server Error.",
    502: "The URL you are trying to access has returned Bad Gateway error.",
}

if len(sys.argv) < 2:
    # Original reached this via an IndexError on sys.argv[1]; testing the
    # length directly avoids using exceptions for expected control flow.
    print("No arguments passed! use -h to check all commands")
else:
    # Pick the WP REST route once: -p downloads pages, the default is posts.
    suffix = ("/wp-json/wp/v2/pages/?per_page=100" if args.pages
              else "/wp-json/wp/v2/posts/?per_page=100")
    if args.file:
        # Batch mode.  NOTE: the original tested sys.argv[1] == "-f", so the
        # long form --file fell into the single-URL branch and crashed on
        # None + str; checking args.file handles both spellings.
        for base in get_urls(args.file):
            try:
                get_data_url(base + suffix)
            except urllib.error.HTTPError as e:
                e.read().decode("utf8", "ignore")  # drain the error body
                print(e)
    elif args.url:
        # Single-URL mode with friendly status-code messages.
        try:
            get_data_url(args.url + suffix)
        except urllib.error.HTTPError as e:
            e.read().decode("utf8", "ignore")  # drain the error body
            message = HTTP_ERROR_MESSAGES.get(e.code)
            if message is not None:
                print(message)
    else:
        # Neither -f nor -u supplied (e.g. only -p): the original crashed
        # with TypeError (None + str); report usage instead.
        print("No arguments passed! use -h to check all commands")