```python
# -*- coding: utf-8 -*-
import json

import scrapy


class NeihanSpider(scrapy.Spider):
    name = 'neihan'
    allowed_domains = ['neihanshequ.com']
    start_urls = ['http://neihanshequ.com/bar/1/?is_json=1&app_name=neihanshequ_web&max_time=1521163598']

    def parse(self, response):
        # The response body is JSON; convert it to a Python dict.
        # Judging from the access pattern below, the payload looks like:
        # {"message": "success",
        #  "data": {"max_time": ..., "data": [{"group": {"text": ...}}, ...]}}
        rs = json.loads(response.text)
        if rs.get('message') == 'success':
            # Pull out the payload
            data = rs.get('data')
            # Timestamp used to build the next page's "load more" URL
            max_time = data.get('max_time')
            # The list of jokes
            detail_data = data.get('data')
            # Loop over the list and extract the text of each joke
            for dz in detail_data:
                text = dz.get('group').get('text')
                print(text)
            # Find the next page: splice the timestamp into the full
            # "load more" URL and issue the request
            if max_time:
                next_url = ('http://neihanshequ.com/bar/1/'
                            '?is_json=1&app_name=neihanshequ_web&max_time=' + str(max_time))
                yield scrapy.Request(url=next_url)
```

For these follow-up requests to keep succeeding, random request headers need to be set up in the middlewares file.
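The original post does not show that middleware, so here is a minimal sketch of what it could look like. The class name `RandomUserAgentMiddleware`, the `USER_AGENTS` list, and the specific agent strings are illustrative assumptions, not part of the original tutorial:

```python
# middlewares.py -- a sketch of a random request-header downloader middleware.
import random


class RandomUserAgentMiddleware(object):
    # Assumed pool of User-Agent strings; extend or replace as needed.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
    ]

    def process_request(self, request, spider):
        # Attach a randomly chosen User-Agent to every outgoing request;
        # returning None lets Scrapy continue processing the request.
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None
```

To activate it, register the class in `settings.py` (the project path `myproject` below is a placeholder for your own project name):

```python
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}
```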