Scraping shop data from Dianping with Python and the Scrapy framework~
I'd like to share my journey of scraping data with Python, starting from zero. I hope it helps others who are also starting from scratch~~ keep at it!
First, my development environment:
Machine: macOS Sierra 10.12.6; IDE: PyCharm + Terminal
My Mac ships with Python 2.7, so I downloaded Python 3.6 and used it for this project, adding the new Python to my PATH. It normally comes bundled with pip. Open a terminal, cd to the directory containing pip, and run: pip install scrapy
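To confirm Scrapy landed in the right interpreter, an optional quick check from the Python 3.6 shell:

# if this prints a version string, Scrapy is importable from this interpreter
import scrapy
print(scrapy.__version__)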
Next, open a terminal, cd to the directory where you want the project, and run: scrapy startproject Test
Scrapy will generate a set of files in that directory; from here on we only need to modify a few of them.
Open the project in PyCharm. The directory structure looks like this:
The root directory is named after your project, and inside it there is a spiders folder containing __init__.py.
At the top level of the package you'll find __init__.py, items.py, middlewares.py, pipelines.py, and settings.py.
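For reference, this is the standard layout that scrapy startproject generates:

Test/
    scrapy.cfg            # deploy configuration file
    Test/                 # the project's Python package
        __init__.py
        items.py          # item definitions go here
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py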
cd into the Test directory and run: scrapy genspider ShopSpider "dianping.com"
This generates a ShopSpider.py file under Test/spiders.
With all the files in place, go to the site you want to scrape and inspect the tag structure of its HTML source.
Based on the data you want to collect, edit items.py:
import scrapy


class TestItem(scrapy.Item):
    # shop name
    shop_name = scrapy.Field()
    # cover image
    shop_img = scrapy.Field()
    # star rating
    shop_star = scrapy.Field()
    # number of reviews
    shop_evaluation = scrapy.Field()
    # average price per person
    shop_price = scrapy.Field()
    # cuisine / category
    shop_type = scrapy.Field()
    # address part 1 (district)
    shop_address1 = scrapy.Field()
    # detailed address
    shop_address2 = scrapy.Field()
    # recommended dish 1
    shop_food1 = scrapy.Field()
    # recommended dish 2
    shop_food2 = scrapy.Field()
    # recommended dish 3
    shop_food3 = scrapy.Field()
    # taste score
    shop_sweet = scrapy.Field()
    # environment score
    shop_environment = scrapy.Field()
    # service score
    shop_server = scrapy.Field()
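One thing worth knowing: a scrapy.Item behaves like a dict, which is what the pipeline further down relies on when it calls dict(item). A quick optional check:

# TestItem supports dict-style assignment and conversion
item = TestItem()
item['shop_name'] = 'some shop'
print(dict(item))  # {'shop_name': 'some shop'}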
Next, edit the spider file, ShopSpider.py:
# -*- coding: utf-8 -*-
import scrapy
from Test.items import TestItem


class ShopSpider(scrapy.Spider):
    """
    Scrapes Shenyang restaurant data from Dianping.
    """
    # spider name
    name = 'ShopSpider'
    # domains the spider is allowed to crawl
    allowed_domains = ['dianping.com']
    # base url
    url = 'http://www.dianping.com/shenyang/ch10/g2714p'
    offset = 1
    # first url to crawl
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//div[@class='shop-list J_shop-list shop-all-list']/ul/li"):
            # initialize the item object
            item = TestItem()
            item['shop_name'] = each.xpath(".//img/@title").extract()[0]
            # cut the image url at the first '%' to drop the resize suffix
            imgorl = each.xpath(".//img/@src").extract()[0]
            img = imgorl.split('%')[0]
            item['shop_img'] = img
            item['shop_star'] = each.xpath(".//div[@class='comment']/span/@title").extract()[0]
            # review count and average price sit in identical tags,
            # so tell them apart by loop position
            price_tag = 0
            for price in each.xpath(".//div[@class='comment']"):
                for p in price.xpath(".//a/b/text()"):
                    if price_tag == 0:
                        # if the review count is missing, the first value found is the
                        # price when it contains '¥'; otherwise it is the review count
                        ep = price.xpath(".//a/b/text()").extract()[0]
                        if '¥' in ep:
                            item['shop_price'] = ep
                        else:
                            item['shop_evaluation'] = ep
                        price_tag += 1
                    elif price_tag == 1:
                        item['shop_price'] = price.xpath(".//a/b/text()").extract()[1]
                        price_tag += 1
            # category and district: the district tag may be missing, so check by position
            at_tag = 0
            for at in each.xpath(".//div[@class='tag-addr']"):
                for att in at.xpath(".//a/span[@class='tag']/text()"):
                    if at_tag == 0:
                        item['shop_type'] = at.xpath(".//a/span[@class='tag']/text()").extract()[0]
                        at_tag += 1
                    elif at_tag == 1:
                        item['shop_address1'] = at.xpath(".//a/span[@class='tag']/text()").extract()[1]
                        at_tag += 1
            # detailed address
            item['shop_address2'] = each.xpath(".//div[@class='tag-addr']/span[@class='addr']/text()").extract()[0]
            # recommended dishes: there may be up to three
            food_tag = 0
            for food in each.xpath(".//div[@class='recommend']"):
                for f in food.xpath(".//a/text()"):
                    if food_tag == 0:
                        item['shop_food1'] = food.xpath(".//a/text()").extract()[0]
                        food_tag += 1
                    elif food_tag == 1:
                        item['shop_food2'] = food.xpath(".//a/text()").extract()[1]
                        food_tag += 1
                    elif food_tag == 2:
                        item['shop_food3'] = food.xpath(".//a/text()").extract()[2]
                        food_tag += 1
            # sub-scores: taste / environment / service
            score_tag = 0
            for score in each.xpath(".//span[@class='comment-list']"):
                for s in score.xpath(".//span/b/text()"):
                    if score_tag == 0:
                        item['shop_sweet'] = score.xpath(".//span/b/text()").extract()[0]
                        score_tag += 1
                    elif score_tag == 1:
                        item['shop_environment'] = score.xpath(".//span/b/text()").extract()[1]
                        score_tag += 1
                    elif score_tag == 2:
                        item['shop_server'] = score.xpath(".//span/b/text()").extract()[2]
                        score_tag += 1
            yield item

        if self.offset < 50:
            self.offset += 1
            # after finishing a page, increment self.offset by 1, build the next
            # page's url, and send a new request with self.parse as the callback
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
I hit a few problems along the way and patched them up bit by bit with the help of Baidu; the main points are commented above.
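One pitfall worth calling out: every extract()[0] above raises an IndexError when the node doesn't exist. A more defensive pattern (a sketch, not what the spider above uses; each and item are the loop variables from parse) is extract_first(), which returns None instead of raising:

# extract_first() returns None when nothing matches, instead of raising IndexError
name = each.xpath(".//img/@title").extract_first()
if name:
    item['shop_name'] = name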
Then edit pipelines.py:
import json


class TestPipeline(object):
    """
    Saves the scraped item data.
    """
    def __init__(self):
        # open the output file (utf-8, so non-ASCII text written
        # with ensure_ascii=False is safe on any platform)
        self.filename = open("shuiguoshengxian.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # convert each item to a JSON record
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # close the file
        self.filename.close()
The file name in the __init__ method is the name of the JSON file that will be written.
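A side note: the trailing ',\n' makes each line invalid JSON on its own, which is why the upload script further down strips the last two characters of every line. If you'd rather write standard JSON Lines (a variant, not what I used), drop the comma in process_item:

# variant: each line is then a complete JSON object on its own
text = json.dumps(dict(item), ensure_ascii=False) + "\n"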
Then edit the settings.py file:
DEFAULT_REQUEST_HEADERS = {
    # 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# fake a browser identity to avoid 403 rejections
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
ITEM_PIPELINES = {
    'Test.pipelines.TestPipeline': 300,
}

# keep 403 responses from crashing the crawl
HTTPERROR_ALLOWED_CODES = [403]
The key point is the USER_AGENT setting: without it, the site rejects requests with a 403 error.
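On top of the user agent, slowing the crawl down also lowers the risk of being blocked. This wasn't in my original settings, but a single extra line in settings.py does it:

# optional: pause (roughly) this many seconds between requests to the same site
DOWNLOAD_DELAY = 1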
In the terminal, run: scrapy crawl ShopSpider
If the crawl succeeds, a .json file appears; open it to see the scraped data.
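As an aside, the spider can also be launched from a plain Python script instead of the command line. A sketch, assuming the spider module sits at Test/spiders/ShopSpider.py and the script is run from the project root (where scrapy.cfg lives):

# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Test.spiders.ShopSpider import ShopSpider  # assumed module path

# get_project_settings() picks up settings.py via scrapy.cfg
process = CrawlerProcess(get_project_settings())
process.crawl(ShopSpider)
process.start()  # blocks until the crawl finishes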
Something like this:
{"shop_name": "张福光九九草莓采摘园", "shop_img": "http://p0.meituan.net/deal/cbb3476245a7a22becae0835e072a031325900.png", "shop_star": "五星商户", "shop_evaluation": "22", "shop_price": "¥122", "shop_type": "水果生鲜", "shop_address1": "苏家屯区", "shop_address2": "来胜村", "shop_sweet": "9.1", "shop_environment": "9.1", "shop_server": "9.2"}, {"shop_name": "糖糖水果捞", "shop_img": "http://p0.meituan.net/waimaipoi/cc0c567369d52a43f9607a8f2734ad7033647.jpg", "shop_star": "准五星商户", "shop_evaluation": "13", "shop_price": "¥22", "shop_type": "水果生鲜", "shop_address1": "和平区", "shop_address2": "南京南街228-36号6门", "shop_sweet": "8.7", "shop_environment": "8.7", "shop_server": "8.7"}, {"shop_name": "奉鲜果切水果捞(浑南店)", "shop_img": "http://p0.meituan.net/deal/571c8808dead876be5b84a640128b12297393.jpg", "shop_star": "四星商户", "shop_evaluation": "11", "shop_type": "水果生鲜", "shop_address1": "浑南区", "shop_address2": "浑南新区夹河街A-20号10门", "shop_sweet": "7.9", "shop_environment": "7.9", "shop_server": "8.0"},
Next, write a small Python script that creates the database table where the scraped data will be stored.
# -*- coding: utf-8 -*-
import pymysql

serverIp = "database server IP"
userName = "login user name"
password = "login password"
databaseName = "database name"

# open the database connection
db = pymysql.connect(serverIp, userName, password, databaseName)
# create a cursor object with cursor()
cursor = db.cursor()
# CREATE TABLE statement -- mind the column length limits
sql = """CREATE TABLE shuiguoshengxian (
    shop_id INT PRIMARY KEY auto_increment,
    shop_name VARCHAR(50),
    shop_img VARCHAR(150),
    shop_star VARCHAR(10),
    shop_evaluation INT,
    shop_price INT,
    shop_type VARCHAR(10),
    shop_address1 VARCHAR(15),
    shop_address2 VARCHAR(50),
    shop_food1 VARCHAR(20),
    shop_food2 VARCHAR(20),
    shop_food3 VARCHAR(20),
    shop_sweet FLOAT,
    shop_environment FLOAT,
    shop_server FLOAT)"""
# run the statement with execute()
cursor.execute(sql)
# use fetchone() to fetch a single row, e.g. to check the server version:
# data = cursor.fetchone()
# print("Database version : %s " % data)
cursor.close()
# close the database connection
db.close()
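If you want to double-check that the table really was created, a quick query before closing the cursor would do it (a sketch):

# list the tables in the current database; shuiguoshengxian should be among them
cursor.execute("SHOW TABLES")
print(cursor.fetchall())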
Then write a script that uploads the JSON data to the database.
# -*- coding: utf-8 -*-
import json
import pymysql

serverIp = "database server IP"
userName = "login user name"
password = "login password"
databaseName = "database name"

# open the database connection; note the charset="utf8" argument at the end
db = pymysql.connect(host=serverIp, user=userName, passwd=password, db=databaseName, port=3306, charset="utf8")
# create a cursor object with cursor()
cursor = db.cursor()

data = []
with open('shuiguoshengxian.json') as f:
    for line in f:
        # each line ends with ',\n'; strip it so json.loads gets valid JSON
        data.append(json.loads(line[0:-2]))

for item in data:
    # get() supplies a default when a key is missing;
    # escape single quotes so they don't break the SQL string
    shop_name_str = item.get('shop_name', "").replace("'", "\\'")
    shop_img_str = item.get('shop_img', '')
    shop_star_str = item.get('shop_star', '')
    shop_evaluation_str = item.get('shop_evaluation', 0)
    shop_price_stro = item.get('shop_price', '0')
    if shop_price_stro != '0':
        # strip the leading '¥'
        shop_price_str = shop_price_stro[1:]
    else:
        shop_price_str = 0
    shop_type_str = item.get('shop_type', '')
    shop_address1_str = item.get('shop_address1', '')
    shop_address2_str = item.get('shop_address2', '')
    shop_food1_str = item.get('shop_food1', '')
    shop_food2_str = item.get('shop_food2', '')
    shop_food3_str = item.get('shop_food3', '')
    shop_sweet_str = item.get('shop_sweet', 0.0)
    shop_environment_str = item.get('shop_environment', 0.0)
    shop_server_str = item.get('shop_server', 0.0)

    sql = "INSERT INTO shuiguoshengxian(shop_name, shop_img, shop_star, shop_evaluation, shop_price, shop_type, shop_address1, shop_address2, shop_food1, shop_food2, shop_food3, shop_sweet, shop_environment, shop_server) VALUES "
    sql = sql + "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');" % (shop_name_str, shop_img_str, shop_star_str, shop_evaluation_str, shop_price_str, shop_type_str, shop_address1_str, shop_address2_str, shop_food1_str, shop_food2_str, shop_food3_str, shop_sweet_str, shop_environment_str, shop_server_str)
    # sql = "UPDATE shops SET shop_price = '%s' WHERE shop_name = '%s';" % (shop_price_str, shop_name_str)
    cursor.execute(sql)

cursor.close()
db.commit()
db.close()
print("success")
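Building the SQL by string formatting works here, but it breaks on odd characters and is unsafe in general. A safer variant (a sketch of the same insert with only two columns shown, meant to replace the string-building inside the loop above) lets pymysql do the escaping:

# parameterized insert: pymysql escapes every value itself,
# so no manual quote-handling is needed
sql = ("INSERT INTO shuiguoshengxian (shop_name, shop_price) "
       "VALUES (%s, %s)")
cursor.execute(sql, (shop_name_str, shop_price_str))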
Note that the JSON file has to sit in the directory you run the script from, because the file is opened with a relative path:

with open('shuiguoshengxian.json') as f:

If the file lives somewhere else, pass the full path instead.
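To make the script independent of the working directory, you can also anchor the path to the script file itself (a small sketch):

import os

# absolute path to shuiguoshengxian.json sitting next to this script file
json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'shuiguoshengxian.json')
with open(json_path) as f:
    pass  # read lines as before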
OK, that's the whole flow.
This write-up isn't very detailed, so if you're just starting out and have questions, feel free to ask; I'll answer whatever I can. Experienced readers are welcome to point out mistakes too: the code does what I wanted, but it surely has plenty of holes, and I'd appreciate the criticism. Thanks.
~~~Let's keep improving together.