大家好,又见面了,我是你们的朋友全栈君。
首先,在 MySQL 中创建好数据库和数据表。
image
然后编写各个模块
items.py(注意:Scrapy 项目中该文件名为 items.py,下文 spider 中 `from jianli.items import JianliItem` 依赖这个文件名)
import scrapy
class JianliItem(scrapy.Item):
name = scrapy.Field()
url = scrapy.Field()
pipelines.py(注意:Scrapy 项目中该文件名为 pipelines.py,settings.py 里的 `jianli.pipelines.JianliPipeline` 依赖这个文件名)
import pymysql #导入数据库的类
class JianliPipeline(object):
conn = None
cursor = None
def open_spider(self,spider):
print(‘开始爬虫’)
self.conn = pymysql.Connect(host=’127.0.0.1′,port=3306,user=’root’,password=”,db=’jianli’) #链接数据库
def process_item(self, item, spider): #编写向数据库中存储数据的相关代码
self.cursor = self.conn.cursor() #1.链接数据库
sql = ‘insert into jl values(“%s”,”%s”)’%(item[‘name’],item[‘url’]) #2.执行sql语句
try: #执行事务
self.cursor.execute(sql)
self.conn.commit()
except Exception as e:
print(e)
self.conn.rollback()
return item
def close_spider(self,spider):
print(‘爬虫结束’)
self.cursor.close()
self.conn.close()
spider(爬虫文件,位于项目的 spiders 目录下)
# -*- coding: utf-8 -*-
import scrapy
import re
from lxml import etree
from jianli.items import JianliItem
class FxhSpider(scrapy.Spider):
name = ‘jl’
# allowed_domains = [‘feixiaohao.com’]
start_urls = [‘http://sc.chinaz.com/jianli/free_{}.html’.format(i) for i in range(3)]
def parse(self,response):
tree = etree.HTML(response.text)
a_list = tree.xpath(‘//div[@id=”container”]/div/a’)
for a in a_list:
item = JianliItem (
name=a.xpath(“./img/@alt”)[0],
url=a.xpath(“./@href”)[0]
)
yield item
settings.py
#USER_AGENT
headers = {
“user-agent”:”Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36″
}
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
‘jianli.pipelines.JianliPipeline’: 300,
}
查看存储情况
image
发布者:全栈程序员-用户IM,转载请注明出处:https://javaforall.cn/153138.html原文链接:https://javaforall.cn
【正版授权,激活自己账号】: Jetbrains全家桶Ide使用,1年售后保障,每天仅需1毛
【官方授权 正版激活】: 官方授权 正版激活 支持Jetbrains家族下所有IDE 使用个人JB账号...