Hello everyone, good to see you again. It's your friend 全栈君.
First, create a Scrapy project named ImagesRename (scrapy startproject ImagesRename).
Then create ImgRename.py under the spiders directory and enter the following code:
import scrapy
from ImagesRename.items import ImagesrenameItem


class ImgsrenameSpider(scrapy.Spider):
    name = 'tujigu'
    start_urls = ['https://www.tujigu.com/a/28177/']  # replace with the gallery URL you like

    def parse(self, response):
        # Instantiate the item
        item = ImagesrenameItem()
        # Note that imgurl is a list, i.e. multiple image URLs
        item['imgurl'] = response.xpath("//div[@class='content']//@src").extract()
        # Grab the page title to use as the gallery name
        item['imgname'] = response.xpath("//div[@class='weizhi']//h1").extract_first()
        # Hand the item to the pipeline, which downloads the images into a folder
        yield item
        # Follow the "next page" link, if any
        next_page = response.xpath('//*[@id="pages"]//a[11]//@href').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
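If you want to sanity-check the two XPath expressions before running a full crawl, a minimal standalone sketch like this works. The HTML snippet here is a hypothetical stand-in for tujigu.com's real markup, not copied from the site:

from scrapy.selector import Selector

# Hypothetical markup standing in for the real page structure
html = """
<div class="weizhi"><h1>Sample Gallery NO.1</h1></div>
<div class="content">
  <img src="https://example.com/img/001.jpg">
  <img src="https://example.com/img/002.jpg">
</div>
"""

sel = Selector(text=html)
print(sel.xpath("//div[@class='content']//@src").extract())
# ['https://example.com/img/001.jpg', 'https://example.com/img/002.jpg']
print(sel.xpath("//div[@class='weizhi']//h1").extract_first())
# '<h1>Sample Gallery NO.1</h1>'

Note that extract_first() returns the whole <h1>…</h1> element, tags included, which is exactly why the pipeline below has to strip markup characters out of the name.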
Replace the contents of items.py with:
import scrapy


class ImagesrenameItem(scrapy.Item):
    # define the fields for your item here like:
    imgurl = scrapy.Field()
    imgname = scrapy.Field()
Add the following to middlewares.py. This step is optional: the two classes give each request a random User-Agent and a random proxy.
import random


class NovelUserAgentMiddleWare(object):
    """Pick a random User-Agent for each request."""

    def __init__(self):
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        ]

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        print('User-Agent: ' + ua)
        request.headers.setdefault('User-Agent', ua)


class NovelProxyMiddleWare(object):
    """Attach a random proxy to each request."""

    def process_request(self, request, spider):
        proxy = self.get_random_proxy()
        print("Request proxy is {}".format(proxy))
        request.meta["proxy"] = "http://" + proxy

    def get_random_proxy(self):
        # a.txt holds the proxy list, one "ip:port" per line
        with open('a.txt', 'r', encoding="utf-8") as f:
            txt = f.read()
        return random.choice(txt.split('\n'))
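One caveat on get_random_proxy(): txt.split('\n') keeps empty strings, so a trailing newline in a.txt can produce an empty proxy and a malformed proxy URL. A slightly hardened sketch of the same idea, under the same assumed file format (one ip:port per line):

import random

def get_random_proxy(path='a.txt'):
    # Assumes one "ip:port" entry per line; blank lines are skipped
    with open(path, 'r', encoding='utf-8') as f:
        proxies = [line.strip() for line in f if line.strip()]
    return random.choice(proxies)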
Replace the contents of pipelines.py with:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import re

from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


class ImagesrenamePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Request every image URL in turn; if a single URL were passed in
        # instead of a list, you could yield directly without the loop
        for image_url in item['imgurl']:
            # meta carries data from the spider down to file_path() below
            yield Request(image_url, meta={'name': item['imgname']})

    # Rename the downloaded file. Without overriding this method the image is
    # saved under its content hash, i.e. a meaningless jumble of characters.
    def file_path(self, request, response=None, info=None):
        # Use the last segment of the URL as the image file name
        image_guid = request.url.split('/')[-1]
        # Receive the gallery name passed via meta above
        name = request.meta['name']
        # Strip tag characters and page markers; skip this step and the folder
        # name keeps markup in it, which garbles it or breaks the download on Windows
        # name = re.sub(u"([^\u4e00-\u9fa5])", "", name)  # alternative: keep Chinese characters only
        name = ''.join(re.findall(r'[^<>/h1第0-9页NO. ]', name))
        # The key to one folder per gallery: {0} is the folder, {1} the file
        filename = u'{0}/{1}'.format(name, image_guid)
        return filename
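The character class in re.findall() looks cryptic: it keeps every character that is not tag markup (<, >, /, h, 1), a digit, a 第…页 page marker, "NO.", or a space. A quick standalone check with a made-up title shows what survives:

import re

# Made-up title, in the form extract_first() returns it: a full <h1> element
name = '<h1>某某写真 第3页 NO.28177</h1>'
print(''.join(re.findall(r'[^<>/h1第0-9页NO. ]', name)))  # -> 某某写真

Be aware that this also strips any legitimate h, 1, N, O, digits, dots, or spaces from the title itself; the commented-out re.sub() line, which keeps only Chinese characters, is the alternative.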
Finally, add the following to settings.py:
BOT_NAME = 'ImagesRename'

SPIDER_MODULES = ['ImagesRename.spiders']
NEWSPIDER_MODULE = 'ImagesRename.spiders'

RETRY_ENABLED = True                # turn retries on
RETRY_TIMES = 20                    # retry up to 20 times
DOWNLOAD_TIMEOUT = 3                # download timeout in seconds
RETRY_HTTP_CODES = [429, 404, 403]  # status codes that trigger a retry

ITEM_PIPELINES = {
    'ImagesRename.pipelines.ImagesrenamePipeline': 300,
}

# Only needed if you added the random User-Agent / proxy middlewares above;
# if you skipped that step, leave this out
DOWNLOADER_MIDDLEWARES = {
    'ImagesRename.middlewares.NovelUserAgentMiddleWare': 544,  # random User-Agent
    'ImagesRename.middlewares.NovelProxyMiddleWare': 543,      # random proxy
}

# Directory where downloaded images are stored
IMAGES_STORE = r'D:\学习\pythonProject\scrapy\ImagesRename'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ImagesRename (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
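That is everything. From the project root, run scrapy crawl tujigu to start the crawl. Two things worth checking first: ImagesPipeline depends on the Pillow library (pip install Pillow), and because file_path() returns '{name}/{guid}', each gallery ends up in its own folder directly under IMAGES_STORE.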