-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path新片场.py
123 lines (106 loc) · 5.41 KB
/
新片场.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import json
def convert(s):
    """Convert a scraped counter such as ``"1,234"`` into an ``int``.

    Args:
        s: The raw scraped value. Either a string of decimal digits
            (``,`` thousands separators and surrounding whitespace are
            tolerated) or a list/tuple of such strings, as returned by a
            selector's ``extract()``; for a sequence the first element
            is used.

    Returns:
        The parsed integer, or ``0`` when the value is missing, empty,
        not a string, or not made up purely of decimal digits.
    """
    # Selectors hand back lists; unwrap the first hit (or treat as missing).
    if isinstance(s, (list, tuple)):
        s = s[0] if s else None
    if not isinstance(s, str):
        return 0
    digits = s.strip().replace(',', '')
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    return int(digits) if digits.isdecimal() else 0
class XpcSpider(scrapy.Spider):
    """Spider that scrapes video posts from xinpianchang.com.

    Starting from the "sort-like" channel listing, it follows every listed
    post and yields plain ``dict`` items of four kinds:

    * post records (started in ``parse_post``, completed and yielded in
      ``parse_video``),
    * comment records (``parse_comment``),
    * post/creator relation records (``parse_post``),
    * creator profile records (``parse_composer``).
    """

    name = 'xpc'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']

    def parse(self, response):
        """Collect the post ids on a listing page and request each post.

        Yields:
            One request per post, handled by ``parse_post``, carrying the
            post id in ``meta['pid']``.
        """
        pid_list = response.xpath(
            '//ul[@class="video-list"]/li[@class="enter-filmplay"]/@data-articleid'
        ).extract()
        for pid in pid_list:
            url = 'https://www.xinpianchang.com/a%s?from=ArticleList' % pid
            request = response.follow(url, self.parse_post)
            request.meta['pid'] = pid
            yield request
        # NOTE: pagination is intentionally not followed. The disabled code
        # followed every href matched by
        #   //div[@class="page-wrap"]/div[@class="page"]/a/@href
        # back into parse() and sent an "Authorization" cookie with each
        # request. If this is re-enabled, load the credential from settings
        # or the environment rather than hard-coding it in the source.

    def parse_post(self, response):
        """Parse a single post page.

        Yields, in order: the video-API request (which carries the partially
        filled post item in ``meta['post']``), the first-page comments
        request, and, for each creator, a profile request followed by a
        post/creator relation item.

        Raises:
            ValueError: if the page source does not contain exactly one
                ``vid: "..."`` occurrence.
        """
        pid = response.meta['pid']
        post = {
            "pid": pid
        }
        categories = response.xpath('//span[contains(@class,"cate")]/a/text()').extract()
        post['cotegray'] = '|'.join([category.strip() for category in categories])
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').extract()
        post['play_counts'] = response.xpath(
            '//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/text()').extract()
        post['like_counts'] = response.xpath(
            '//span[@class="v-center like-counts fs_12 c_w_f fw_300"]/text()').extract()
        labels = response.xpath(
            '//div[@class="fs_12 fw_300 c_b_3 tag-wrapper"]//text()').extract()
        post['lable'] = '|'.join([label.strip() for label in labels])

        # Video link: the video id is embedded in the page source. The
        # single-element unpacking is deliberate - it fails loudly unless
        # exactly one id is present.
        vid, = re.findall('vid: "(.*?)"', response.text)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/%s?expand=resource&usage=xpc_web' % vid
        request = Request(video_url, self.parse_video)
        request.meta['post'] = post
        yield request

        # Comments link: only page 1 (24 entries per page) is requested.
        comment_url = "https://app.xinpianchang.com/comments?resource_id=%s&type=article&page=1&per_page=24" % pid
        yield Request(comment_url, self.parse_comment)

        # Creator profiles, plus one post<->creator relation item each.
        cid_list = response.xpath('//ul[@class="creator-list"]/li/a/@data-userid').extract()
        creator_url = "https://www.xinpianchang.com/u%s?from=articleList"
        for cid in cid_list:
            request = response.follow(creator_url % cid, self.parse_composer)
            request.meta['cid'] = cid
            yield request

            yield {
                'pid': pid,
                'cid': cid,
                # Role text shown next to this creator in the creator list.
                'roles': response.xpath(
                    '//ul[@class="creator-list"]/li/a[@data-userid=$var]'
                    '/following-sibling::div[1]/a/following-sibling::span/text()',
                    var=cid).extract(),
            }

    def parse_video(self, response):
        """Complete the post item with video data from the JSON video API.

        Reads the item from ``meta['post']``, adds the default resource's
        ``https_url`` (as ``video``) and ``duration``, then yields it.
        """
        post = response.meta['post']
        result = json.loads(response.text)
        default_resource = result['data']['resource']['default']
        post['video'] = default_resource['https_url']
        post['duration'] = default_resource['duration']
        yield post

    def parse_comment(self, response):
        """Yield one item per comment in the JSON comments response.

        Each item holds the commenter's name, avatar and id, the id of the
        commented resource, the comment's approval count and its content.
        """
        result = json.loads(response.text)
        for entry in result['data']['list']:
            user = entry['userInfo']
            # Build a fresh dict per comment: re-yielding one shared dict
            # would make every emitted item alias the same object, so later
            # iterations would overwrite earlier items.
            yield {
                'uname': user['username'],
                'avatar': user['avatar'],
                'cid': user['id'],
                'pid': entry['resource_id'],
                'contentlove': entry['count_approve'],
                'content': entry['content'],
            }

    def parse_composer(self, response):
        """Parse a creator profile page into a single item.

        All fields are the raw lists returned by ``extract()`` except
        ``like_counts``, which is passed through ``convert()`` to obtain an
        integer.
        """
        # NOTE(review): meta['cid'] is set by the caller but not stored on
        # the item - confirm whether the profile record should carry it.
        composer = {}
        composer['banner'] = response.xpath('//div[@class="banner-wrap"]/@style').extract()
        composer['name'] = response.xpath(
            '//div[@class="creator-info"]'
            '/p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').extract()
        composer['like_counts'] = convert(response.xpath(
            '//div[@class="creator-info"]'
            '/p[@class="creator-detail fs_14 fw_300 c_b_9"]'
            '/span[@class="like-wrap"]/span[2]/text()').extract())
        composer['fans_counts'] = response.xpath(
            '//span[@class="fans-wrap"]/span[2]/text()').extract()
        composer['follow_counts'] = response.xpath(
            ' //div[@class="creator-info"]'
            '/p[@class="creator-detail fs_14 fw_300 c_b_9"]'
            '/span[@class="follow-wrap"]/span[2]/text()').extract()
        composer['location'] = response.xpath(
            '//span[@class="icon-location v-center"]/following-sibling::span[1]/text()').extract()
        composer['carerr'] = response.xpath(
            ' //div[@class="creator-info"]'
            '/p[@class="creator-detail fs_14 fw_300 c_b_9"]'
            '/span[@class="icon-career v-center"]/following-sibling::span[1]/text()').extract()
        yield composer