基于Python的Scrapy爬虫入门：代码详解

# allowed_domains = ['tuchong.com']

# start_urls = ['http://tuchong.com/']

def start_requests(self):
    """Yield the initial requests for the spider.

    Builds the tag-posts REST URL for pages 1 through 10 of the tag
    '美男' (``count=20`` posts per page, ``order=weekly``) and yields one
    ``scrapy.Request`` per page, with ``self.parse`` as the callback.
    """
    # URL template: %s is the tag name, %d is the page number.
    url = 'https://tuchong.com/rest/tags/%s/posts?page=%d&count=20&order=weekly'

    # Fetch 10 pages; each page asks for 20 posts (count=20 above).
    for page in range(1, 11):
        # Register parse as the callback and hand the Request object back.
        yield scrapy.Request(url=url % ('美男', page), callback=self.parse)

# Callback: processes the fetched content and fills in the TuchongItem fields.
# NOTE(review): the original indentation of this snippet was lost; the lines
# below are the body of parse (and of its for loop) — confirm when restoring.

def parse(self, response):

# Decode the response body as JSON.
# NOTE(review): body_as_unicode() is reportedly deprecated in newer Scrapy
# releases in favor of response.text — confirm against the Scrapy version in use.
body = json.loads(response.body_as_unicode())

# NOTE(review): items is not used in the visible part of this snippet.
items = []

# Build one TuchongItem per entry of body['postList'].
for post in body['postList']:

item = TuchongItem()

# Copy the post's fields onto the item under the same keys.
item['type'] = post['type']

item['post_id'] = post['post_id']

item['site_id'] = post['site_id']

item['title'] = post['title']

item['url'] = post['url']

item['excerpt'] = post['excerpt']

# image_count is converted to int before being stored.
item['image_count'] = int(post['image_count'])

# Start with an empty mapping for the images.
item['images'] = {}

# Turn images into an array of {img_id: img_url} objects.
# NOTE(review): the snippet is cut off here — the code that fills
# item['images'] and returns/yields the items is not visible.

　　推荐阅读

　　摆脱尴尬，我国IPv6加速跑需要“魔鬼步伐”

CTO训练营 | 12月3-5日，深圳，是时候成为优秀的技术管理者了。人工智能、大数据、云计算、物联网，其实都是>>>详细阅读

地址：http://www.17bianji.com/lsqh/39298.html