scrapy爬虫实例之携带cookie登录人人网

爬什么

爬取人人网的个人档案页面。由于这个页面是登录之后才能够访问的，我们爬虫的时候必须要携带cookie。

怎么做

重写Spider自带的start_requests(self)方法，在其中用yield返回一个带cookies参数的scrapy.Request来发起请求。

def start_requests(self):
    """Yield the first request with the logged-in session cookies attached.

    Requests ``self.start_urls[0]`` and passes the cookies, converted from
    the raw header string into a dict, through the ``cookies`` argument of
    ``scrapy.Request``. The response is handled by ``self.parse``.
    """
    # Raw Cookie header string; replace it with the one from your own logged-in session.
    raw_cookies = "anonymid=k5jjy9gb-p4f3be; _r01_=1; taihe_bi_sdk_uid=41402119b11006539defa847b6465809; jebe_key=538aac78-0044-476f-980c-0e53071b02d6%7C42d7b478811716336baa94c4523e9833%7C1579351511075%7C1%7C1579351510138; __utma=151146938.1215165037.1579351626.1579351626.1579351626.1; __utmz=151146938.1579351626.1.1.utmcsr=mail.qq.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _de=04F2D88119EA2B63E16F2C7283EEE4526DEBB8C2103DE356; depovince=GW; jebecookies=34dc0299-7196-4cfd-a488-2bc049b14d0c|||||; JSESSIONID=abcMtuJJwHUBCaBrWmObx; ick_login=29877d8a-34a7-4d2c-bb52-a247fb308dae; taihe_bi_sdk_session=97aeb76cc7bd9c0c4fe9d7c6dcfb5eed; p=bde487db3148887ef533de31e99862b30; first_login_flag=1; ln_uact=3321647547@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn221/20200118/2045/h_main_TCCN_9c6c00011d14195a.jpg; t=de1623c5b5c112750032f096cc552d6d0; societyguester=de1623c5b5c112750032f096cc552d6d0; id=973482150; xnsid=97c70d4d; ver=7.0; loginfrom=null; wp_fold=0; jebe_key=538aac78-0044-476f-980c-0e53071b02d6%7C42d7b478811716336baa94c4523e9833%7C1582296534360%7C1%7C1582296533331"
    # Convert the header into a dict. Split each "name=value" pair on the
    # FIRST "=" only: some values (e.g. __utmz) contain "=" themselves and
    # would be truncated by a plain split("=")[1].
    cookies = {
        name: value
        for name, _, value in (pair.partition("=") for pair in raw_cookies.split("; "))
    }
    yield scrapy.Request(
        self.start_urls[0],
        callback=self.parse,
        cookies=cookies,
    )

代码

renren.py：

import scrapy
import re

class RenrenSpider(scrapy.Spider):
    """Spider that fetches a renren.com profile page using session cookies.

    The profile page is only reachable when logged in, so the first request
    carries cookies copied from a logged-in session instead of performing
    a login.
    """

    name = 'renren'
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/973482150/profile']

    def start_requests(self):
        """Yield the first request with the session cookies attached."""
        # Raw Cookie header string; replace it with the one from your own logged-in session.
        raw_cookies = "anonymid=k5jjy9gb-p4f3be; _r01_=1; taihe_bi_sdk_uid=41402119b11006539defa847b6465809; jebe_key=538aac78-0044-476f-980c-0e53071b02d6%7C42d7b478811716336baa94c4523e9833%7C1579351511075%7C1%7C1579351510138; __utma=151146938.1215165037.1579351626.1579351626.1579351626.1; __utmz=151146938.1579351626.1.1.utmcsr=mail.qq.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _de=04F2D88119EA2B63E16F2C7283EEE4526DEBB8C2103DE356; depovince=GW; jebecookies=34dc0299-7196-4cfd-a488-2bc049b14d0c|||||; JSESSIONID=abcMtuJJwHUBCaBrWmObx; ick_login=29877d8a-34a7-4d2c-bb52-a247fb308dae; taihe_bi_sdk_session=97aeb76cc7bd9c0c4fe9d7c6dcfb5eed; p=bde487db3148887ef533de31e99862b30; first_login_flag=1; ln_uact=3321647547@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn221/20200118/2045/h_main_TCCN_9c6c00011d14195a.jpg; t=de1623c5b5c112750032f096cc552d6d0; societyguester=de1623c5b5c112750032f096cc552d6d0; id=973482150; xnsid=97c70d4d; ver=7.0; loginfrom=null; wp_fold=0; jebe_key=538aac78-0044-476f-980c-0e53071b02d6%7C42d7b478811716336baa94c4523e9833%7C1582296534360%7C1%7C1582296533331"
        # Convert the header into a dict. Split each "name=value" pair on the
        # FIRST "=" only: some values (e.g. __utmz) contain "=" themselves and
        # would be truncated by a plain split("=")[1].
        cookies = {
            name: value
            for name, _, value in (pair.partition("=") for pair in raw_cookies.split("; "))
        }
        yield scrapy.Request(
            # start_urls is a list while Request expects a single URL string,
            # so pass its first element.
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies,  # attach the session cookies to this request
        )

    def parse(self, response):
        """Check the profile page was reached, then request the details page."""
        # Search the decoded page for the account's user name; a non-empty
        # list confirms the spider reached the profile page while logged in.
        print(re.findall("要么绽放要么死去", response.body.decode()))
        yield scrapy.Request(
            "http://www.renren.com/973482150/profile?v=info_timeline",  # profile details page
            # No cookies argument here: the cookies set on the first request
            # are sent along with the spider's later requests.
            callback=self.parse_detail,
        )

    def parse_detail(self, response):
        """Print the user-name matches found on the profile details page."""
        # Same check as in parse(): confirms the details page was fetched.
        print(re.findall("要么绽放要么死去", response.body.decode()))

settings.py：

# -*- coding: utf-8 -*-

# Scrapy settings for cookielogin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "cookielogin"

SPIDER_MODULES = ["cookielogin.spiders"]
NEWSPIDER_MODULE = "cookielogin.spiders"

# Turn on cookie debugging so the cookies being transmitted can be
# followed in the crawl output.
COOKIES_DEBUG = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cookielogin (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'cookielogin.middlewares.CookieloginSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'cookielogin.middlewares.CookieloginDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'cookielogin.pipelines.CookieloginPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

总结

我们重写了scrapy内置的start_requests方法，发起带cookies的请求，以访问登录后的页面。cookies只需要在第一个请求中设置一次：scrapy默认开启cookie管理（COOKIES_ENABLED默认为True），之后爬虫对同一站点发起的请求都会自动带上这些cookies。我们通过在settings.py开启COOKIES_DEBUG功能，追踪cookies的传送过程。

将cookies字符串转化为字典形式的方法：cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookies.split("; ")}。注意要用split("=", 1)只在第一个等号处切分，因为有些cookie的值本身包含等号（例如__utmz），直接用split("=")[1]会把值截断。这个方法很常用，要熟记。
正则表达式re.findall("要么绽放要么死去",response.body.decode())是在查找响应页面中是否含有“要么绽放要么死去”这个字符串（即用户名），它会以列表形式返回所有匹配到的“要么绽放要么死去”字符串；如果返回空列表，说明页面中没有用户名，也就是没有以登录状态访问到该页面。
response.body是响应页面的原始字节内容（bytes类型），调用decode()方法（默认按UTF-8解码）之后得到的就是响应页面的html字符串；也可以直接使用response.text获取。