-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch.py
49 lines (45 loc) · 1.46 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding: utf-8 -*-
"""Fetch the short comments of one Douban movie and store them in MongoDB.

Usage::

    python fetch.py MID MNAME CAPTCHA_ID CAPTCHA_WORD COUNT

MID           movie id passed to the scraper's ``loadComments``.
MNAME         name of the MongoDB collection (in the ``doubancomments``
              database) that receives the comments.
CAPTCHA_ID    captcha id forwarded to the scraper login.
CAPTCHA_WORD  captcha answer forwarded to the scraper login.
COUNT         number of comments to scan; they are requested in pages of 20.

The working directory, the start time and a final dict of counters
(``total_valid``, ``exsisted``, ``data_error``) are printed to stdout.
Progress messages go through ``logging`` (stderr by default).
"""
import datetime
import json
import logging
import os
import sys
import time

WORK_DIR = '/home/excited/tools/doubanMovieComments/'
PAGE_SIZE = 20            # comments requested per page
PAGE_DELAY_SECONDS = 15   # pause after every page request
# Values that stand for a failed page instead of a list of comments.
FAILED_PAGE_MARKERS = ("page load error.", "ConnectTimeout")


def page_offsets(count, page_size=PAGE_SIZE):
    """Return the start offsets needed to scan ``count`` comments.

    Offsets are 0-based multiples of ``page_size`` (0, 20, 40, ...), so
    consecutive pages neither overlap nor leave a gap between them.
    """
    return range(0, count, page_size)


def is_failed_page(comments):
    """Return True when ``comments`` is empty or one of the failure markers."""
    return not comments or comments in FAILED_PAGE_MARKERS


def store_comments(session, collection, mid, count, delay=PAGE_DELAY_SECONDS):
    """Load the comments of movie ``mid`` page by page into ``collection``.

    session     object providing ``loadComments(mid, start=offset)``.
    collection  MongoDB collection; must provide ``find_one`` and
                ``insert_one``.
    mid         movie id, passed through to ``loadComments``.
    count       number of comments to scan.
    delay       seconds to sleep after each page request.

    Returns a dict with the counters ``total_valid`` (comments received),
    ``exsisted`` (comments whose ``cid`` was already stored) and
    ``data_error`` (``PAGE_SIZE`` is added for every failed or empty page).
    """
    total_valid = 0
    duplicate_cid = 0
    data_error = 0

    for start in page_offsets(count):
        logging.info("scanning %d to %d comments.", start, start + PAGE_SIZE)
        try:
            comments = session.loadComments(mid, start=start)
        except Exception:
            # Best effort: one broken page must not abort the whole run, but
            # keep the traceback and let KeyboardInterrupt/SystemExit through.
            logging.exception("loading comments from offset %d failed", start)
            comments = FAILED_PAGE_MARKERS[0]

        if is_failed_page(comments):
            data_error += PAGE_SIZE
        else:
            total_valid += len(comments)
            for comment in comments:
                # cid is matched as a string, exactly as it was queried before.
                query = {"cid": "%s" % comment['cid']}
                # Any stored document with this cid makes it a duplicate,
                # no matter how many copies already exist.
                if collection.find_one(query) is not None:
                    duplicate_cid += 1
                else:
                    collection.insert_one(comment)

        logging.info("valid records %d", total_valid)
        logging.info("exsisted records %d", duplicate_cid)
        logging.info("page error failed %d records", data_error)
        time.sleep(delay)

    return {"total_valid":total_valid,"exsisted":duplicate_cid,"data_error":data_error}


def main(argv=None):
    """Run the fetch for the movie described by the command-line arguments.

    ``argv`` defaults to ``sys.argv``; see the module docstring for the
    expected arguments. Raises ``ValueError`` if COUNT is not an integer and
    exits with a usage message when arguments are missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 6:
        sys.exit("usage: fetch.py MID MNAME CAPTCHA_ID CAPTCHA_WORD COUNT")

    # Without this the root logger stays at WARNING and every progress
    # message below is dropped.
    logging.basicConfig(level=logging.INFO)

    os.chdir(WORK_DIR)
    print(os.getcwd())

    # Imported only after the chdir, as the script has always done, in case
    # the scraper module depends on the working directory (not verified).
    # Keeping both imports here also lets the helpers above be imported
    # without the database driver installed.
    from scraper import doubanMovieComments
    import pymongo

    # NOTE(review): MongoDB's stock port is 27017; 2717 is kept as found.
    # Confirm that it is intentional.
    client = pymongo.MongoClient('localhost', 2717)
    db = client['doubancomments']
    print(datetime.datetime.now())

    mid, mname, captcha_id, captcha_word = argv[1:5]
    count = int(argv[5])

    # SECURITY: account credentials are embedded in the source. They can be
    # overridden through the environment; the literals remain only as a
    # backward-compatible fallback and should be removed (and the password
    # rotated) once the deployment sets DOUBAN_USER / DOUBAN_PASSWORD.
    user = os.environ.get('DOUBAN_USER', '[email protected]')
    password = os.environ.get('DOUBAN_PASSWORD', 'qiqi19900701')

    session = doubanMovieComments(user, password, captcha_word, captcha_id)
    try:
        stats = store_comments(session, db[mname], mid, count)
    finally:
        # Always end the remote session, even if a page or insert failed.
        session.logout()
    print(stats)


if __name__ == "__main__":
    main()