-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscrap.py
139 lines (115 loc) · 6.29 KB
/
webscrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import time
import os
from datetime import datetime, date, timezone
base_url = 'https://travel.state.gov/content/travel/en/legal/visa-law0/visa-bulletin/{}/visa-bulletin-for-{}-{}.html'
# 2005 is the first year of china backlog at all
# 2007 is the first year of china backlog monthly
# 2015oct is the first year of 4 tables
# https://travel.state.gov/content/travel/en/legal/visa-law0/visa-bulletin/2016/visa-bulletin-for-october-2015.html
# this is a big jump month and the divergence of the finalaction and priority
# https://www.myprioritydate.com/VisaBulletin
# 2007
years = range(2016, 2022)[::-1]
months = ['january', 'february', 'march', 'april', 'may', 'june',
'july', 'august', 'september', 'october', 'november', 'december'][::-1]
master_list = []
master_list_long = []
df_list_long = [pd.DataFrame() for _ in range(4)]
for year in years:
for month in months:
# the website treats the last quarter differently
if month in ['october', 'november', 'december']:
url = base_url.format(str(year+1), month, str(year))
else:
url = base_url.format(str(year), month, str(year))
print(url)
timeout = random.randint(3, 30)
print(f"Accessing URL: {url} (timeout: {timeout} seconds)")
time.sleep(timeout)
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
china_tables = []
tables = soup.find_all('table')
for table in tables:
table_rows = table.find_all('tr')
for row in table_rows:
cells = row.find_all('td')
for cell in cells:
if "CHINA" in cell.get_text():
if table not in china_tables:
china_tables.append(table)
df_list = []
for i, table in enumerate(china_tables):
rows = table.find_all('tr')
columns = [cell.get_text().strip() for cell in rows[0].find_all('td')]
data = []
for row in rows[1:]:
data.append([cell.get_text().strip() for cell in row.find_all('td')])
df = pd.DataFrame(data, columns=columns)
df.replace('U', pd.NA, inplace=True)
# unpivot the scores columns into rows
df_long = pd.melt(df, id_vars=df.columns[0], value_vars=df.columns[1:],
var_name='countries', value_name='date_gov')
df_long['date'] = pd.to_datetime('01' + month + str(year))
df_long.loc[df_long['date_gov'] == 'C', 'date_gov'] = (df_long.loc[df_long['date_gov'] == 'C', 'date'])
df_long['date_gov'] = pd.to_datetime(df_long['date_gov'])
df_long['delay'] = df_long['date'] - df_long['date_gov']
df_long['delay_days'] = df_long['delay'].dt.days
df_long.columns.values[0] = 'VisaType'
if not df.empty:
df_list.append(df)
if len(china_tables) == 4:
df_list_long[i] = pd.concat([df_list_long[i], df_long], axis=0)
else:
# when only two table is avaiable, only update #1 to #1_df, #2 to #3_df
df_list_long[2*(i > 0)] = pd.concat([df_list_long[2*(i > 0)],df_long], axis=0)
df_list_long[2*(i > 0)+1] = pd.concat([df_list_long[2*(i > 0)+1],df_long], axis=0)
for i, table in enumerate(df_list):
print(f"Table {i+1}:")
print(table)
print("\n")
## print all tables indiviually
# filename = f"data/{year}_{month}_Table_{i+1}.csv"
# if not os.path.exists('data'):
# os.mkdir('data')
# table.to_csv(filename, index=False)
if df_list:
master_list.append({'year': str(year), 'month': month, 'df_list': df_list})
#master_list_long.append(df_list_long)
# only the last two tables out of the 4 are usefully for employment analysis
finaldate=df_list_long[2]
prioitydate=df_list_long[3]
# concatenate the 2 tables
world_merged_table = pd.concat([finaldate.assign(datetype='Final Action Date'),
prioitydate.assign(datetype='Priority Date')])
world_merged_table['countries'] = np.where(world_merged_table['countries'].str.contains('All Chargeability'), 'ROW',
np.where(world_merged_table['countries'].str.contains('CHINA'), 'CHINA', world_merged_table['countries']))
######################################################
# seperating the 4 tables
eb2_final=finaldate.loc[(finaldate['VisaType'] == '2nd') & (finaldate['countries'].str.contains('CHINA'))].reset_index(drop=True)
eb2_priority=prioitydate.loc[(prioitydate['VisaType'] == '2nd') & (prioitydate['countries'].str.contains('CHINA'))].reset_index(drop=True)
eb1_final=finaldate.loc[(finaldate['VisaType'] == '1st') & (finaldate['countries'].str.contains('CHINA'))].reset_index(drop=True)
eb1_priority=prioitydate.loc[(prioitydate['VisaType'] == '1st') & (prioitydate['countries'].str.contains('CHINA'))].reset_index(drop=True)
# keep the country info, replace long strings into short strings
finaldate['countries2'] = np.where(finaldate['countries'].str.contains('All Chargeability'), 'ROW',
np.where(finaldate['countries'].str.contains('CHINA'), 'CHINA', finaldate['countries']))
prioitydate['countries2'] = np.where(prioitydate['countries'].str.contains('All Chargeability'), 'ROW',
np.where(prioitydate['countries'].str.contains('CHINA'), 'CHINA', prioitydate['countries']))
# concatenate the four tables
merged_table = pd.concat([eb2_final.assign(category='EB2 Final Action Date'),
eb2_priority.assign(category='EB2 Priority Date'),
eb1_final.assign(category='EB1 Final Action Date'),
eb1_priority.assign(category='EB1 Priority Date')])
#########################
# print the final tables with timestamp to plot in R later
today = datetime.now()
timestamp = today.strftime("%Y-%m-%d_%H-%M-%S")
if not os.path.exists('data'):
os.mkdir('data')
filename_world = os.path.join("data", f"{timestamp}_rawdata_world.csv")
world_merged_table.to_csv(filename_world, index=False)