本文共 7389 字,大约阅读时间需要 24 分钟。
前两篇文章中,简单用scrapy写了一个小demo。本篇文章的主要目标是用scrapy完整爬取慕课网所有免费课程的标题、图片、地址、学习人数、难度、方向、分类、时长、评分、评论数等信息。
其实,很简单,在上一次的代码中修改调整一下就好。
Spider
# -*- coding: utf-8 -*-
import scrapy
import json
from urllib import parse as urlparse

from scrapyDemo.ImoocCourseItem import ImoocCourseItem


# Crawls every free course on imooc.com.
class ImoocSpider(scrapy.Spider):
    """Spider for imooc.com free courses.

    Crawl flow: course list page -> one listing page per course type
    (following pagination) -> course detail page -> AJAX endpoint for
    the learner count, after which the finished item is emitted.
    """

    # Unique spider name; Scrapy locates the spider by it (`scrapy crawl imooc`).
    name = "imooc"
    # Seed URL list.
    start_urls = ['http://www.imooc.com/course/list']
    # URLs whose domain is not in this list are not followed.
    allowed_domains = ['www.imooc.com']

    def parse(self, response):
        """Extract the course-type navigation links and schedule one
        listing crawl per concrete type."""
        type_links = response.css('div.course-content .course-nav-row')[2].css(
            '.bd ul li a')
        # Skip the first entry (the "all" pseudo-type), follow the rest.
        for type_link in type_links[1:]:
            course_type = type_link.css('::text').extract_first()
            type_url = type_link.css('::attr(href)').extract_first()
            yield scrapy.Request(
                url=urlparse.urljoin(response.url, type_url),
                callback=self.parse_by_type,
                meta={'course_type': course_type})

    # Crawl one course-type listing page.
    def parse_by_type(self, response):
        """Parse one listing page: schedule a detail-page request per
        course card, then follow the "next page" link if present."""
        course_type = response.meta['course_type']
        for learn_node in response.css('a.course-card'):
            # BUG FIX: build a fresh item per course. The original created a
            # single item before the loop and passed the same mutable object
            # to every scheduled request, so concurrently-running callbacks
            # overwrote each other's course_url/image/cate fields.
            item = ImoocCourseItem()
            item['course_type'] = course_type
            # Absolute URL of the course detail page.
            course_url = urlparse.urljoin(
                response.url, learn_node.css('::attr(href)').extract_first())
            item['course_url'] = course_url
            # Course banner image.
            item['image'] = learn_node.css(
                'img.course-banner::attr(src)').extract_first()
            # Category labels, comma-joined.
            item['cate'] = ','.join(
                learn_node.css('div.course-label label::text').extract())
            # BUG FIX: carry the item under an explicit 'item' key so Scrapy's
            # internal meta entries (depth, download_timeout, ...) do not leak
            # into the yielded item as the original `item = response.meta` did.
            yield scrapy.Request(
                url=course_url,
                callback=self.parse_learn,
                meta={'item': item})
        # Follow pagination ("下一页" = next page).
        next_page_url = response.css(
            u'div.page a:contains("下一页")::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=urlparse.urljoin(response.url, next_page_url),
                callback=self.parse_by_type,
                meta={'course_type': course_type})

    # Course detail page.
    def parse_learn(self, response):
        """Fill in the detail-page fields, then request the AJAX endpoint
        that returns the learner count."""
        item = response.meta['item']
        # Course title.
        item['title'] = response.xpath(
            '//h2[@class="l"]/text()').extract_first()
        # Short course description.
        item['brief'] = response.xpath(
            '//div[@class="course-brief"]/p/text()').extract_first()
        static_values = response.css(
            'div#main div.statics div.static-item span.meta-value::text'
        ).extract()
        # The static blocks appear on the page in this order:
        # difficulty level, duration, overall rating.
        item['difficulty_level'] = static_values[0]
        item['duration'] = static_values[1]
        item['overall_rating'] = static_values[2]
        # BUG FIX: extract_first(default='') so a missing review-count node
        # does not raise AttributeError on .replace(). '人评价' = "people rated".
        item['evaluation_number'] = response.css(
            'a.person-num span.person-num::text').extract_first(
                default='').replace('人评价', '')
        # Teacher id comes from a data attribute on the avatar image.
        item['teacher_id'] = response.css(
            'div.teacher-info a img::attr(data-userid)').extract_first()
        # The course id is the last path segment of the detail-page URL.
        ids = response.url.split('/')[-1]
        yield scrapy.Request(
            url=urlparse.urljoin(
                response.url, '/course/AjaxCourseMembers?ids=' + ids),
            callback=self.parse_learn_num,
            meta={'item': item})

    # Learner-count AJAX response.
    def parse_learn_num(self, response):
        """Parse the JSON learner-count payload and emit the finished item."""
        item = response.meta['item']
        # response.text replaces the deprecated body_as_unicode().
        data = json.loads(response.text)
        item['learn_num'] = data['data'][0]['numbers']
        yield item
Item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ImoocCourseItem(scrapy.Item):
    """One imooc course as scraped by ImoocSpider.

    Field values are filled in across several callbacks (listing page,
    detail page, learner-count AJAX call) before the item is yielded.
    """
    # Removed the redundant trailing `pass` of the original class body.
    title = scrapy.Field()              # course title
    image = scrapy.Field()              # banner image URL
    brief = scrapy.Field()              # short description
    cate = scrapy.Field()               # comma-joined category labels
    course_type = scrapy.Field()        # type from the navigation bar
    course_url = scrapy.Field()         # detail-page URL
    learn_num = scrapy.Field()          # learner count (AJAX endpoint)
    difficulty_level = scrapy.Field()   # e.g. beginner/intermediate
    duration = scrapy.Field()           # total course length
    overall_rating = scrapy.Field()     # aggregate score
    evaluation_number = scrapy.Field()  # number of reviews
    teacher_id = scrapy.Field()         # numeric teacher id
pipelines
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapyDemo.db.dbhelper import DBHelper


class ScrapydemoPipeline(object):
    """Pipeline that persists every scraped course item to MySQL."""

    def __init__(self):
        # One shared DB helper (backed by a twisted adbapi pool) per pipeline.
        self.db = DBHelper()

    def process_item(self, item, spider):
        """Insert the item into the database and hand it on unchanged."""
        self.db.insert(item)
        return item
保存至数据库
这里也附上建表语句吧
-- Target table for the imooc crawler; one row per course.
CREATE TABLE `imooc_courses` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `cate` varchar(255) DEFAULT NULL,
  `type` varchar(11) DEFAULT NULL,
  `image` varchar(255) DEFAULT NULL,
  `brief` varchar(255) DEFAULT NULL,
  `course_url` varchar(255) DEFAULT NULL,
  `learn_num` int(11) DEFAULT '0',
  `difficulty_level` varchar(255) DEFAULT NULL,
  `duration` varchar(255) DEFAULT NULL,
  `overall_rating` varchar(255) DEFAULT NULL,
  `evaluation_number` int(11) DEFAULT '0',
  `teacher_id` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=792 DEFAULT CHARSET=utf8mb4;
下面是保存到数据库的类
# -*- coding: utf-8 -*-
import pymysql
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings  # read settings config
import time


class DBHelper():
    """Asynchronous MySQL access through twisted's adbapi connection pool.

    Connection parameters are read from the project settings
    (MYSQL_HOST / MYSQL_DBNAME / MYSQL_USER / MYSQL_PASSWD / MYSQL_PORT);
    adjust the code if your settings differ.
    """

    def __init__(self):
        settings = get_project_settings()  # pull the configured values
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            port=settings['MYSQL_PORT'],
            # BUG FIX: the table is declared utf8mb4 (see the DDL); the
            # original 'utf8' charset cannot store 4-byte characters such as
            # emoji, and use_unicode=False made pymysql return bytes instead
            # of str, risking mojibake on Chinese text.
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        # ** expands the dict into keyword arguments (host=..., db=..., ...).
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)

    def connect(self):
        """Return the shared connection pool."""
        return self.dbpool

    def insert(self, item):
        """Asynchronously insert one course item.

        The write runs inside the pool; failures are routed to
        _handle_error instead of raising in the caller.
        """
        sql = ("insert into imooc_courses(title,type,cate,image,brief,"
               "course_url,learn_num,difficulty_level,duration,"
               "overall_rating,evaluation_number,teacher_id) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        query = self.dbpool.runInteraction(self._conditional_insert, sql, item)
        query.addErrback(self._handle_error)
        return item

    def _conditional_insert(self, tx, sql, item):
        """Execute the parameterized insert inside a pool transaction."""
        # Parameter order must match the column list in the SQL above;
        # note the item field is 'course_type' while the column is 'type'.
        params = (item["title"], item['course_type'], item['cate'],
                  item['image'], item['brief'], item['course_url'],
                  item['learn_num'], item['difficulty_level'],
                  item['duration'], item['overall_rating'],
                  item['evaluation_number'], item['teacher_id'])
        tx.execute(sql, params)

    def _handle_error(self, failure):
        """Print database errors so they do not vanish silently."""
        print('--------------database operation exception!!-----------------')
        print(failure)
大功告成
scrapy crawl imooc
转载地址:http://imuws.baihongyu.com/