Use Scrapy to crawl a Baidu Tieba forum, with pagination support.
- tieba_baidu.py  (project created via: scrapy startproject tieba_baidu)
import scrapy
from lab1 import items
from scrapy import Request
class TiebaBaiduSpider(scrapy.Spider):
    """Spider that scrapes thread titles and authors from a Baidu Tieba
    forum, following the "next page" link until the last page."""

    name = 'tieba.baidu'  # spider name used by `scrapy crawl tieba.baidu`
    allowed_domains = ['tieba.baidu.com']
    # URL of the specific forum to crawl (kw=<forum name>, pn=<offset>)
    start_urls = ['https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0']

    def start_requests(self):
        # Kick off the crawl with the first page of the forum.
        yield Request("https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0", callback=self.parse)

    def parse(self, response):
        """Extract title/author from every thread on the page and yield
        a request for the next page if one exists."""
        for result in response.css("div.threadlist_lz"):
            # The `a` prefix matters: other elements may also carry the
            # .j_th_tit class, so restrict the match to anchor tags.
            item = items.Lab1Item()
            item['title'] = result.css("a.j_th_tit::text").extract_first()
            item['author'] = result.css("div.threadlist_author > span.tb_icon_author::attr(title)").extract_first()
            yield item
        # Pagination: extract_first() returns None on the last page, so the
        # None check must happen BEFORE prepending the scheme — the original
        # code did '"https:" + extract_first()' first, which raises TypeError
        # on the final page instead of stopping cleanly.
        next_href = response.css("div.pagination-default > a::attr(href)").extract_first()
        if next_href is not None:
            # dont_filter=True: pagination URLs must not be deduplicated away.
            yield Request("https:" + next_href, callback=self.parse, dont_filter=True)
2. In settings.py, uncomment the ITEM_PIPELINES setting to enable the pipeline below.
3. pipelines.py
# Store scraped items in a sqlite3 database.
import sqlite3
class Lab1Pipeline(object):
    """Item pipeline that persists scraped tieba items into sqlite3."""

    def __init__(self):
        # Open (or create) the local database file tieba.db.
        # NOTE(review): the tieba_list table is assumed to exist already —
        # confirm it is created elsewhere, or add CREATE TABLE IF NOT EXISTS.
        self.conn = sqlite3.connect("tieba.db")

    def process_item(self, item, spider):
        """Insert one item's title/author into the tieba_list table.

        Returns the item so that subsequent pipelines keep receiving it
        (required by the Scrapy item-pipeline contract; the original
        version returned None, which would feed None to later pipelines).
        """
        cursor = self.conn.cursor()
        # Parameterized query (?) avoids SQL injection / quoting issues.
        insert_sql = """insert into tieba_list(title,author) values (?,?)"""
        params = (item['title'], item['author'])
        cursor.execute(insert_sql, params)
        # Commit each item so data survives an interrupted crawl.
        self.conn.commit()
        cursor.close()
        return item

    def close_spider(self, spider):
        # Release the database connection when the spider finishes
        # (the original leaked the connection).
        self.conn.close()
4. items.py
Define an Item field for each piece of scraped content (e.g. title, author).