# Panic at the distro
# Author: s0nlxaftrsh0ck
# Date: 5/16/2022
# Usage: Download my favorite Linux distros + other ISOs as well
# Sites to scrape from:
# https://distrowatch.com/
# Should I get ISOs direct from the sites they originated from??
# For parsing a web page
from bs4 import BeautifulSoup
# For grabbing and using the ISOs via .torrent files
import qbittorrent
# For converting / downloading the page needed for scraping
import requests
# For using system functions / read and write files
import os
# Fetch the DistroWatch front page and keep a local copy for offline scraping
page = requests.get("https://distrowatch.com")
page_content = page.text
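# Small addition (not in the original script): since os is imported but unused so far,
# one use for it here is making sure the save directory exists before writing.
# The directory below simply mirrors the hard-coded path used in the open() call.
os.makedirs('/home/s0nlx/Coding/Panic-at-the-Distro/.sites', exist_ok=True)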
with open('/home/s0nlx/Coding/Panic-at-the-Distro/.sites/DistroWatch-Torrent.html', 'w', encoding='utf-8') as fp:
    fp.write(page_content)
# print(page.status_code) # Received Status code 200 @ https://distrowatch.com
# print(page.content)
soup = BeautifulSoup(page_content, 'html.parser')
# print(soup.prettify())
# print(list(soup.children))
# Top-level children of the soup for DistroWatch:
# [0] = Doctype
# [1] = the <html> Tag (the part we actually navigate)
# [2] = NavigableString (whitespace, if present)
print([type(item) for item in list(soup.children)])
# Grab the <html> Tag (child [1] from the list above) so we can navigate it
html = list(soup.children)[1]
print("\nTag List created\n")
# Print out URLs? For now this only grabs the first <a> tag and its href
a_tag = html.a
link = a_tag['href']
print("\nPrinting out links\n")
print(link)
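# Hedged sketch (not part of the original script): to collect every torrent link
# instead of just the first <a>, soup.find_all('a', href=True) can be filtered for
# hrefs ending in '.torrent'. The '.torrent' suffix check is an assumption about how
# DistroWatch marks its torrent downloads, not something verified here.
torrent_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.torrent')]
print(f"Found {len(torrent_links)} torrent links")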
# Print out the tag list
#print(list(html.children))
#print("\nBody created...\n")
#body = list(html.children)[0]
#print("\nPrinting body\n")
#print(list(body.children))
#print("\nGrabbing paragraphs\n")
#p = list(body.children)[1]
#print("\nPrinting Text from paragraphs\n")
#print(p.get_text())
#print("\n--->End<---\n")
# Finding all instances of a tag at once
#all_option = soup.find_all('option')[0].get_text()
#print(all_option)
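# Hedged sketch (not part of the original script): once torrent links are collected,
# the imported qbittorrent module (python-qbittorrent) could hand them to a local
# qBittorrent Web UI. The URL, username, and password below are assumptions about a
# default local setup, not values taken from this project.
#qb = qbittorrent.Client('http://127.0.0.1:8080/')
#qb.login('admin', 'adminadmin')
#for url in torrent_links:
#    qb.download_from_link(url)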