Crawling the 4567tv Movie Site [Worth Bookmarking]

Hello everyone, nice to see you again. I'm your friend, 全栈君.

movie.py (the spider)

# -*- coding: utf-8 -*-
import scrapy
from moviePro1.items import Moviepro1Item


class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/喜剧/id/1.html']

    # Generic URL template for the paginated list pages:
    url = "https://www.4567tv.tv/index.php/vod/show/class/喜剧/id/1/page/%d.html"

    page = 1

    # Parse the movie names from a list page:
    def parse(self, response):
        print("Crawling movie data from page {}...".format(self.page))
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            item = Moviepro1Item()
            name = li.xpath('./div/a/@title').extract_first()
            item["name"] = name
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()

            # Manually send a request to the detail page;
            # meta passes a dict of values on to the callback
            yield scrapy.Request(detail_url, callback=self.parse_detail,
                                 meta={"item": item})
        if self.page < 5:
            print("-----------------------------------------------------------")
            self.page += 1
            new_url = self.url % self.page
            yield scrapy.Request(new_url, callback=self.parse)

    # Parse the movie synopsis from a detail page:
    def parse_detail(self, response):
        # Receive the dict passed along with the request
        item = response.meta["item"]
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[3]/text()').extract_first()
        item["desc"] = desc
        yield item
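
A quick aside: on Scrapy 1.7 and later, `cb_kwargs` is a cleaner way to hand the item from `parse` to `parse_detail` than `meta`, since `meta` is also shared with middlewares. A minimal sketch of the same hand-off, assuming the rest of the spider stays unchanged:

# Replacement methods for MovieSpider (assumes Scrapy 1.7+);
# pagination logic omitted for brevity.
def parse(self, response):
    li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
    for li in li_list:
        item = Moviepro1Item()
        item["name"] = li.xpath('./div/a/@title').extract_first()
        detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
        # Each cb_kwargs entry becomes a keyword argument of the callback
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             cb_kwargs={"item": item})

def parse_detail(self, response, item):
    # item arrives as a normal parameter; no response.meta lookup needed
    item["desc"] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[3]/text()').extract_first()
    yield item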

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Moviepro1Item(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
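
For context, `scrapy.Item` subclasses behave like dicts restricted to their declared fields, which is exactly how the spider above uses them. A small illustration (the values are placeholders):

item = Moviepro1Item()
item["name"] = "Some Movie"
item["desc"] = "A short synopsis..."
print(dict(item))  # {'name': 'Some Movie', 'desc': 'A short synopsis...'}
# Assigning to a field that was not declared raises KeyError:
# item["rating"] = 9.0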

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Moviepro1SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Moviepro1DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class Moviepro1Pipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
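
As written, this pipeline only prints each item. To persist the results, one option is a second pipeline using the standard `open_spider`/`close_spider` hooks. The sketch below writes one JSON object per line; the class and file names are illustrative, and it would still need its own entry in `ITEM_PIPELINES`:

import json


class Moviepro1JsonPipeline(object):
    fp = None

    def open_spider(self, spider):
        # Runs once when the spider starts: open the output file
        self.fp = open('movies.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line (JSON Lines format)
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # Runs once when the spider closes: release the file handle
        self.fp.close()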

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for moviePro1 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'moviePro1'

SPIDER_MODULES = ['moviePro1.spiders']
NEWSPIDER_MODULE = 'moviePro1.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'



# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'moviePro1 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'moviePro1.middlewares.Moviepro1SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'moviePro1.middlewares.Moviepro1DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'moviePro1.pipelines.Moviepro1Pipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
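
With the settings in place, the spider is normally started from the project root with `scrapy crawl movie` (add `-o movies.csv` to let Scrapy's built-in feed export write the items, which can replace a hand-written persistence pipeline for simple cases). It can also be launched programmatically; a minimal sketch, assuming the default project layout:

# run.py, placed in the project root next to scrapy.cfg (illustrative name)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from moviePro1.spiders.movie import MovieSpider

# Load the project's settings.py and run the spider in-process
process = CrawlerProcess(get_project_settings())
process.crawl(MovieSpider)
process.start()  # blocks until the crawl finishes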