-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcreate_dataset.py
84 lines (67 loc) · 3.58 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import sys
import csv
import os
import json
from unicodedata import category
from tqdm import tqdm
from utils import *
# userid_dict = {line.strip(): {"politifact_fake": [], "politifact_real": [
# ], "gossipcop_fake": [], "gossipcop_real": []} for line in lines}
def new_user():
    """Return a fresh, empty record for one user.

    The record has one list per news-source/label category, followed by one
    list per matching ``*_rt`` category. Every call builds new list objects,
    so records never share state.
    """
    base_categories = ("politifact_fake", "politifact_real",
                       "gossipcop_fake", "gossipcop_real")
    record = {name: [] for name in base_categories}
    # The "_rt" lists come after all base lists, in the same category order.
    record.update((name + "_rt", []) for name in base_categories)
    return record
# Raise the csv module's per-field size limit as far as this platform allows.
# Start from sys.maxsize and shrink the candidate by a factor of 10 each time
# csv.field_size_limit() rejects it with OverflowError.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
    else:
        # The limit was accepted; nothing more to do.
        break
# Build `users`: user id -> record (see new_user()) listing, per category, the
# ids of the tweets found on disk for that user. Tweets go into
# "<news_source>_<label>", retweets into "<news_source>_<label>_rt".
users = {}
# The dump directories do not depend on the dataset being processed.
dump_dir = "{}/tweets".format(dump_location)
dump_dir_rt = "{}/retweets".format(dump_location)
for data_choice in data_collection_choice:
    data_category = "{}_{}".format(data_choice["news_source"], data_choice["label"])
    data_category_rt = data_category + "_rt"
    csv_path = '{}/{}.csv'.format(dataset_path, data_category)

    # Count physical lines (minus the header) without loading the whole file.
    # The value is only used as the progress-bar total, so it is approximate
    # when quoted fields contain embedded newlines.
    with open(csv_path, encoding="UTF-8") as csvfile:
        lines = max(sum(1 for _ in csvfile) - 1, 0)

    print("Collecting {} {}".format(
        data_choice["news_source"], data_choice["label"]))
    # newline="" lets the csv module do its own line-ending handling.
    with open(csv_path, encoding="UTF-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for news in tqdm(reader, total=lines):
            # DictReader fills missing trailing columns with None; treat such
            # rows as having no tweet ids instead of crashing on .split().
            tweet_ids = news["tweet_ids"] or ""
            for tweet_id in tweet_ids.split("\t"):
                if not tweet_id.isdigit():
                    continue

                tweet_path = "{}/{}.json".format(dump_dir, tweet_id)
                if os.path.exists(tweet_path):
                    with open(tweet_path, 'r') as tweet_file:
                        tweet = json.load(tweet_file)
                    user_id = tweet["user"]["id_str"]
                    if user_id not in users:
                        users[user_id] = new_user()
                    users[user_id][data_category].append(tweet_id)

                retweets_path = "{}/{}.json".format(dump_dir_rt, tweet_id)
                if os.path.exists(retweets_path):
                    with open(retweets_path, "r") as retweets_file:
                        retweets = json.load(retweets_file)
                    for retweet in retweets:
                        try:
                            user_id = retweet["user"]["id_str"]
                        except (KeyError, TypeError):
                            # Best effort: report an entry that lacks the
                            # expected user/id_str structure and move on.
                            print(retweet)
                            continue
                        if user_id not in users:
                            users[user_id] = new_user()
                        # The retweeting user is credited with the id of the
                        # original tweet, not the id of the retweet itself.
                        users[user_id][data_category_rt].append(tweet_id)
# Annotate every user record with the size of each category list
# ("<category>_count" and "<category>_rt_count") and with "total_count",
# the sum of those sizes over all configured data choices.
for user_record in users.values():
    user_record["total_count"] = 0
    for data_choice in data_collection_choice:
        data_category = "{}_{}".format(data_choice["news_source"], data_choice["label"])
        # Handle the tweet list first, then its retweet counterpart.
        for list_name in (data_category, data_category + "_rt"):
            list_size = len(user_record[list_name])
            user_record[list_name + "_count"] = list_size
            user_record["total_count"] += list_size
# Write the complete per-user mapping as indented JSON. The context manager
# guarantees the file is flushed and closed before the script continues.
with open("{}/user_map_all.json".format(dump_location), "w") as user_map_file:
    json.dump(users, user_map_file, indent=4)
# Write all collected user ids, one per line, with no trailing newline.
with open('{}/all.txt'.format(dump_location), 'w', encoding='utf-8') as f:
    f.write('\n'.join(str(x) for x in users))