使用scrapy爬遍百度百科，附所有源码_Jiale685

网络投稿 02-07 3204

零、安装相关库

使用conda或者pip安装scrapy，并安装mysql（代码中还用到了pymysql、beautifulsoup4和itemadapter，需要一并安装）。

一、新建项目

使用Terminal（cmd）cd到项目路径，输入如下命令

scrapy startproject bdbk_2 # bdbk_2是自己起的项目名

二、设计items

打开项目根目录下的items.py文件，这里类似Java下的类，需要提前定义好所需的对象（字段），做如下编辑：

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.item import Item, Field


class Bdbk2Item(scrapy.Item):
    """Container for the data scraped from one Baidu Baike entry page.

    Each field is filled in by the spider's ``parse`` callback and later
    written to the ``bdbk_test`` MySQL table by the item pipeline.
    """

    # define the fields for your item here like:
    # name = scrapy.Field()

    # Timestamp taken when the page was parsed.
    ticks = Field()
    # URL of the entry page.
    url = Field()
    # Entry title.
    title = Field()
    # Raw HTML content of the page.
    content = Field()
    # Basic introduction (the summary paragraph of the entry).
    basic_introduction = Field()
    # Number of times the entry has been edited.
    writeNum = Field()
    # User who most recently updated the entry.
    recently_updated_user = Field()
    # Date of the most recent update.
    recently_updated_date = Field()
    # List of the major contributors to the entry.
    outstanding_contribution_list = Field()


# 三、编写爬虫

打开根目录下spiders文件夹，在其中新建（或打开）爬虫文件xxx_spider.py进行编辑，这里写我们的爬虫，做如下编辑：

# ---- spider module (lives in the project's spiders/ package) ----
import urllib
import time
import sys
import re

import scrapy
from scrapy.selector import Selector
from bs4 import BeautifulSoup

from ..items import Bdbk2Item
from ..pipelines import MySQLPipeline


class QuotesSpider(scrapy.Spider):
    """Crawl Baidu Baike entries, following every ``/item/...`` link found.

    The crawl starts from ``strat_url``. Each parsed page produces one
    ``Bdbk2Item`` plus requests for the entry links discovered on that page
    which have been neither seen in this run nor stored in MySQL already.
    """

    name = "bdbk_2"

    # Prefix used to turn the site-relative hrefs into absolute URLs.
    MAIN_URL = 'https://baike.baidu.com'
    # Entry links: "/item/" followed by one or more percent-encoded bytes.
    # Compiled once instead of on every parsed page.
    ITEM_HREF_RE = re.compile(r'/item/(\%\w{2})+')

    def __init__(self, *args, **kwargs):
        # Scrapy builds spiders through from_crawler(*args, **kwargs) and
        # expects the base initialiser to run, so forward everything.
        super().__init__(*args, **kwargs)
        self.strat_url = 'https://baike.baidu.com/item/%E4%BA%AC%E4%B8%9C/210931'
        # Private connection used only for check_url(); it is separate from
        # the pipeline instance Scrapy creates from ITEM_PIPELINES.
        self.db = MySQLPipeline()
        # URLs discovered but not yet parsed / URLs already parsed.
        self.new_url_set = set()
        self.old_urls_set = set()
        self.new_url_set.add(self.strat_url)

    def parse_webpage(self, html_cont):
        """Collect the entry links contained in a page.

        Args:
            html_cont: list whose first element is the page's HTML string
                (the result of ``selector.xpath('//html').extract()``).

        Returns:
            The set of absolute entry URLs found on this page that were not
            known before this call. Every entry URL found is also added to
            ``self.new_url_set``. An empty ``html_cont`` yields an empty set.
        """
        discovered = set()
        if not html_cont:
            return discovered

        soup = BeautifulSoup(html_cont[0], 'html.parser')
        for anchor in soup.find_all('a', href=self.ITEM_HREF_RE):
            href = anchor['href']
            if not href.startswith('/item/'):
                continue
            full_url = self.MAIN_URL + href
            if full_url not in self.new_url_set and full_url not in self.old_urls_set:
                discovered.add(full_url)
            self.new_url_set.add(full_url)
        return discovered

    def print_item(self, item):
        """Debug helper: print every field except the raw HTML to stdout."""
        separator = '************************************************************************************************************************************************************************'
        print(separator)
        print(item['ticks'])
        print(item['url'])
        print(item['title'])
        print(item['basic_introduction'])
        # print(item['content'])
        print(item['writeNum'])
        print(item['recently_updated_user'])
        print(item['recently_updated_date'])
        print(str(item['outstanding_contribution_list']))
        print(separator)

    def start_requests(self):
        """Seed the crawl with the single start URL."""
        yield scrapy.Request(url=self.strat_url, callback=self.parse)

    def parse(self, response):
        """Extract one item from an entry page and follow its entry links.

        Yields:
            The ``Bdbk2Item`` for this page, then a ``scrapy.Request`` for
            each newly discovered entry URL that ``check_url`` does not find
            in the database.
        """
        try:
            selector = Selector(response)
            # Title location on the page: //div[2]/dl[1]/dd/h1
            title = selector.xpath('//div/dl/dd/h1/text()').extract_first()
            # The next two statements would extract all visible text instead
            # of the raw HTML:
            # content = selector.xpath('//html')
            # content = content[0].xpath('string(.)').extract()
            content = selector.xpath('//html').extract()
            # Called for its result and side effect; it must not be yielded
            # (the original yielded its None return value into Scrapy).
            discovered_urls = self.parse_webpage(content)

            basic_introduction_list = selector.xpath(
                '//div[contains(@class,"lemma-summary") or '
                'contains(@class,"lemmaWgt-lemmaSummary")]//text()'
            ).extract()
            basic_introduction = ''.join(
                text.strip('\n') for text in basic_introduction_list
            )
            writeNum = selector.xpath(
                '//div[3]/dl/dd[1]/ul/li[2]/text()').extract_first()
            recently_updated_user = selector.xpath(
                '//div[3]/dl/dd[1]/ul/li[3]/span[2]/a/text()').extract_first()
            recently_updated_date = selector.xpath(
                '//div[3]/dl/dd[1]/ul/li[3]/span[2]/span/text()').extract_first()
            outstanding_contribution_list = selector.xpath(
                '//div[3]/dl/dd[2]/ul/li/a[@class="usercard show-userCard"]/text()'
            ).extract()

            item = Bdbk2Item()
            item['ticks'] = time.time()
            item['url'] = response.url
            item['title'] = title
            item['content'] = content
            item['basic_introduction'] = basic_introduction
            item['writeNum'] = writeNum
            item['recently_updated_user'] = recently_updated_user
            item['recently_updated_date'] = recently_updated_date
            item['outstanding_contribution_list'] = str(outstanding_contribution_list)
        except Exception:
            # Still best-effort (one bad page must not stop the crawl), but
            # the failure is now logged instead of silently discarded.
            self.logger.exception("Failed to parse %s", response.url)
            return

        # Mark this page as done.
        self.new_url_set.discard(response.url)
        self.old_urls_set.add(response.url)

        # self.print_item(item)
        yield item

        # Only the links first seen on this page are queued. Iterating the
        # shared new_url_set here (as before) could fail with "set changed
        # size during iteration" when another callback mutated it, and it
        # re-checked the whole backlog against MySQL on every page.
        for url in discovered_urls:
            # URLs already stored in MySQL are skipped silently.
            if self.db.check_url(url) is None:
                yield scrapy.Request(url=url, callback=self.parse)

    def closed(self, reason):
        """Close the spider's private database connection when the crawl ends."""
        self.db.close_spider(self)


# 四、存储
# 这样就是一个爬虫了，但我们还需要存储结果，打开根目录下的pipelines.py文件，这里是负责处理item的地方，你可以输出、存到文件、或者存到数据库
# 针对我的这个项目，你需要提前在mysql建立spider数据库，bdbk_test表，表的属性如下：

# ---- pipelines.py ----
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import logging
import time

from itemadapter import ItemAdapter
import pymysql as pymysql
from pymysql.cursors import DictCursor

logger = logging.getLogger(__name__)


class Bdbk2Pipeline:
    """Default pass-through pipeline generated by the Scrapy template."""

    def process_item(self, item, spider):
        return item


class MySQLPipeline(object):
    """Store scraped items in the ``bdbk_test`` table of a MySQL database.

    The connection parameters default to the values this project was written
    with, so ``MySQLPipeline()`` keeps working unchanged.
    NOTE(review): the default credentials are placeholders for a local test
    database - do not ship real credentials in source code.
    """

    INSERT_SQL = """insert into bdbk_test(
            time_id, url, title, content, basic_introduction, writeNum,
            recently_updated_user, recently_updated_date,
            outstanding_contribution_list
        ) value (%s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    def __init__(self, host='127.0.0.1', port=3306, db='spider',
                 user='root', passwd='123456', charset='utf8'):
        self.connect = pymysql.connect(
            host=host,
            port=port,
            db=db,
            user=user,
            passwd=passwd,
            charset=charset,
            use_unicode=True)
        self.cursor = self.connect.cursor(DictCursor)
        print("连接数据库成功")

    def process_item(self, item, spider):
        """Insert one item; on failure log it, roll back and keep going.

        Returns:
            The item, unchanged, so later pipelines still receive it.
        """
        try:
            self.cursor.execute(
                self.INSERT_SQL,
                (
                    item['ticks'],
                    item['url'],
                    item['title'],
                    item['content'],
                    item['basic_introduction'],
                    item['writeNum'],
                    item['recently_updated_user'],
                    item['recently_updated_date'],
                    item['outstanding_contribution_list'],
                ),
            )
            self.connect.commit()
        except Exception:
            # Storage stays best-effort as before, but the error is now
            # visible and the failed transaction is not left open.
            logger.exception("Failed to store item in bdbk_test")
            try:
                self.connect.rollback()
            except pymysql.MySQLError:
                logger.exception("Rollback failed")
        return item

    def check_url(self, url):
        """Look up whether a URL has already been stored.

        Returns:
            The matching row (a dict holding ``time_id``) or ``None`` when the
            URL is not stored or the query failed.
        """
        sql = "SELECT time_id FROM bdbk_test WHERE url= %s"
        try:
            # Parameters are passed as a real 1-tuple; "(url)" is just a str.
            self.cursor.execute(sql, (url,))
            result = self.cursor.fetchone()
            # End the read transaction so the next lookup sees rows committed
            # by other connections in the meantime (relevant for InnoDB with
            # autocommit off; harmless otherwise).
            self.connect.commit()
        except pymysql.MySQLError:
            logger.exception("URL lookup failed for %s", url)
            result = None
        return result

    def close_spider(self, spider):
        # Close the cursor and the connection.
        self.cursor.close()
        self.connect.close()
        print("数据库关闭游标和连接")


# if __name__ == '__main__':
#     my = MySQLPipeline()
#     my.check_url('https://baike.baidu.com/item/%E8%A1%A1%E9%98%B3%E5%8E%BF')

# 五、设置文件

编写settings.py文件，这里对爬虫做一些默认设置，比如模拟浏览器

# Scrapy settings for bdbk_2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bdbk_2'

SPIDER_MODULES = ['bdbk_2.spiders']
NEWSPIDER_MODULE = 'bdbk_2.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bdbk_2 (+http://www.yourdomain.com)'
# This project presents itself as a desktop browser instead.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

# Obey robots.txt rules
# The template default (True) was overridden further down the original file;
# the setting is now assigned once, with the value that actually took effect.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'bdbk_2.middlewares.Bdbk2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'bdbk_2.middlewares.Bdbk2DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'bdbk_2.pipelines.Bdbk2Pipeline': 300,
#}
# Setting ITEM_PIPELINES enables the MySQL storage; without it the code in
# pipelines.py is never used.
ITEM_PIPELINES = {
    'bdbk_2.pipelines.MySQLPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# 六、启动爬虫

1.一般是通过Terminal（cmd）cd到项目路径，输入如下命令

scrapy crawl bdbk_2

2.我个人推荐，假如你使用的是pycharm，就在项目根目录建立main.py文件

import scrapy
from scrapy.cmdline import execute

# Guarded so the crawl starts only when this file is run as a script (for
# example from PyCharm's Run/Debug), not when the module is merely imported.
if __name__ == '__main__':
    # Same as typing `scrapy crawl bdbk_2` in a terminal at the project root.
    execute(['scrapy', 'crawl', 'bdbk_2'])

然后运行main文件就可以运行爬虫了，你也可以打断点进行调试

七、附一个成果截图吧

一天大概能爬1-2w数据，如果去掉content = Field()这个字段，效率应该能增加很多