forked from streetsidesoftware/cspell-dicts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate.py
83 lines (70 loc) · 2.12 KB
/
update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
Update the Django dictionary from the official documentation's general index.

Set VERSION and FILENAME before running:
```sh
pip install -r requirements.txt
python update.py
```
"""
from bs4 import BeautifulSoup
import requests
# URL template of the Django documentation's general index page; the "{}"
# placeholder is filled with VERSION via str.format() in update().
BASE_URL = "https://docs.djangoproject.com/en/{}/genindex/"
# Documentation version substituted into BASE_URL.
VERSION = "4.2" # update to match current stable version number
# Path of the word-list file that update() opens in "w" mode and rewrites.
FILENAME = "src/django.txt"
def _split_index_entry(raw_text):
    """Return the dictionary words contained in one index entry's text.

    Only the leading token of *raw_text* is kept (everything before the first
    space or opening parenthesis). That token is then split on dots, so
    ``django.db`` yields ``django`` and ``db``. Terms starting with ``--``
    (command line params) and empty fragments yield nothing.
    """
    # cleanup spaces & parenthesis
    term = raw_text.strip().split(" ", 1)[0].split("(", 1)[0]
    if term.startswith("--"):  # ignore command line params
        return []
    # Drop empty fragments (empty term, leading/trailing/double dots) so that
    # no blank line is written to the dictionary file.
    return [word for word in term.split(".") if word]


def update():
    """
    Update FILENAME contents

    1. get https://docs.djangoproject.com/en/VERSION/genindex/ content
    2. extract all DT tags from the index
    3. cleanup (remove details, links, parenthesis, etc.)
    4. split dotted terms if needed (django.db -> django AND db)
    5. deduplicate & sort the list
    6. write each word in FILENAME, one per line

    Raises:
        requests.RequestException: the index page could not be fetched
            (network failure, timeout, or HTTP error status).
        RuntimeError: the fetched page has no ``<div role="main">`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(BASE_URL.format(VERSION), timeout=30)
    # Fail before touching FILENAME if the server answered with an error page
    # (e.g. a VERSION that does not exist); otherwise the dictionary would be
    # overwritten with whatever that page contains.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    data = soup.find("div", role="main")  # ignore the rest of the page
    if data is None:
        raise RuntimeError(
            'No <div role="main"> found in {}'.format(BASE_URL.format(VERSION))
        )

    # init with terms missing from index (mostly closing template tags)
    terms = {
        "elif",
        "empty",
        "endautoescape",
        "endblocktrans",
        "endcomment",
        "endfor",
        "endfilter",
        "endif",
        "endifequal",
        "endifnotequal",
        "endifchanged",
        "endspaceless",
        "endverbatim",
        "endwith",
        "openblock",
        "closeblock",
        "openvariable",
        "closevariable",
        "openbrace",
        "closebrace",
        "opencomment",
        "closecomment",
    }

    for index_entry in data.find_all("dt"):
        # Prefer the text of the entry's first link when it has one.
        entry = index_entry.a or index_entry
        text = entry.string
        if text is None:
            # No single string could be resolved for this entry; skip it
            # (same outcome as the AttributeError the lookup used to raise).
            continue
        terms.update(_split_index_entry(text))

    # Explicit encoding so the output does not depend on the platform default.
    with open(FILENAME, "w", encoding="utf-8") as output:
        output.writelines(word + "\n" for word in sorted(terms))
# Script entry point: regenerate the dictionary only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    update()