Python 爬取文件并自动保存到本地
趁热打铁,又把文件保存到本地的代码实现了。根据目标网站的文件分类形式,
我选择先处理站点每 5 分钟的数据,以 district -> year -> month 的文件夹嵌套形式来存储文件。同时用类封装了上一篇爬虫的代码,代码分别如下:
AutoFile.py
import os
import urllib
from urllib.request import urlopen
import time
import random
class AutoFile:
    """Create the local ``district/year/month`` folder tree and download files into it.

    Instantiating the class installs a process-wide ``urllib`` opener that
    routes requests through the local proxy at ``127.0.0.1:1080``.
    """

    # Root folder for every download. Both backslashes are escaped explicitly;
    # the value is the same string as before, without the invalid ``\P`` escape.
    root_path = "F:\\Pemsdata\\"
    # Scheme and host that each file's relative ``url`` is appended to.
    head_url = 'http://pems.dot.ca.gov'
    # Alias of the ``urllib.request`` module used for all HTTP work.
    request = urllib.request

    def __init__(self, filetype):
        """Select the per-type root folder and install the proxy opener.

        Args:
            filetype: Name of the data type; used as the sub-folder of
                ``root_path`` under which everything is stored.
        """
        print("正在启动文件模块..........")
        self.root_path = self.root_path + filetype + '\\'
        proxy = self.request.ProxyHandler({
            'http': '127.0.0.1:1080',
            'https': '127.0.0.1:1080',
        })
        opener = self.request.build_opener(proxy)
        # install_opener is global: later urlretrieve calls use this proxy.
        self.request.install_opener(opener)
        print("正在加载目录:" + self.root_path)

    def mkdir(self, path):
        """Ensure *path* exists as a directory, creating parents as needed.

        Args:
            path: Directory path; surrounding whitespace and trailing
                backslashes are stripped before use.

        Returns:
            Always ``True``.
        """
        path = path.strip().rstrip("\\")
        if os.path.exists(path):
            print(path + ' 目录已存在')
        else:
            # exist_ok avoids a crash if the folder appears between the
            # existence check and the creation.
            os.makedirs(path, exist_ok=True)
            print(path + ' 创建成功')
        return True

    def file_num(self, f_path):
        """Count the entries in *f_path* whose names end with ``gz``.

        The directory is listed directly, so the process working directory
        is left unchanged.

        Args:
            f_path: Directory to inspect.

        Returns:
            Number of matching entries.
        """
        num = sum(1 for name in os.listdir(f_path) if name.endswith('gz'))
        print("在" + f_path + "目录下已经存文件个数:" + str(num))
        return num

    def check_dir(self, f_district_path, f_yy_path, month, list_path):
        """Create ``root/district/year/month`` and download the missing files.

        Args:
            f_district_path: District folder name.
            f_yy_path: Year folder name.
            month: Month folder name.
            list_path: List of dicts with ``file_name`` and ``url`` keys
                describing the files expected in the month folder.
        """
        f_path = self.root_path
        for part in (f_district_path, f_yy_path, month):
            # os.path.join yields the same backslash-separated path on
            # Windows and a valid path on other platforms.
            f_path = os.path.join(f_path, part)
            if not self.mkdir(f_path):
                return
        print("开始下载........")
        num = self.file_num(f_path)
        self.download_file(f_path, list_path, num)

    def schedule(self, a, b, c):
        """Progress callback for ``urlretrieve``.

        Args:
            a: Number of blocks transferred so far.
            b: Block size in bytes.
            c: Total size in bytes; ``-1`` (or ``0``) when it is unknown.
        """
        if c <= 0:
            # Without a known total a percentage is meaningless (it used to
            # come out negative, or divide by zero); show bytes instead.
            print('已下载 %d 字节' % (a * b))
            return
        per = min(100.0, 100.0 * a * b / c)
        print('%.2f%%' % per)

    def download_file(self, f_path, list_file, num):
        """Download the entries of *list_file* that are not yet on disk.

        The first *num* entries are treated as already downloaded and are
        skipped.

        Args:
            f_path: Destination directory.
            list_file: List of dicts with ``file_name`` and ``url`` keys.
            num: Number of files already present in *f_path*.
        """
        e_num = len(list_file)
        if num >= e_num:  # nothing left (also covers extra local files)
            print("已经是最新的数据")
            return
        print("还需要下载文件个数:" + str(e_num - num))
        for day in list_file[num:]:
            print("正在下载文件:" + day['file_name'])
            # Random pause between requests.
            time.sleep(random.randint(1, 3))
            url = self.head_url + day['url']
            print(url)
            target = os.path.join(f_path, day['file_name'])
            # Download under a temporary name first: an interrupted transfer
            # must not leave a truncated ``.gz`` that file_num would count
            # as a finished file on the next run.
            partial = target + '.part'
            self.request.urlretrieve(url, partial, self.schedule)
            os.replace(partial, target)
        print("下载结束")
AutoLogin.py(补充说明:AutoFile 中的回调函数 schedule 借鉴了网上的例子;我这里得到的文件总大小是 -1,通常是因为服务器的响应没有返回 Content-Length,urlretrieve 无法得知文件总大小,于是给回调函数的第三个参数传入 -1。)
import requests
import json
from BD2019.autoFile import AutoFile
class AutoLogin:
    """Log in to the PeMS site and hand each month's file list to ``AutoFile``.

    A single ``requests`` session is created and logged in by the
    constructor; ``get_station_5min_url`` then queries the Clearinghouse
    listing per district and passes every month's files to
    ``AutoFile.check_dir``.
    """

    head_url = 'http://pems.dot.ca.gov'
    district_id = '0'
    yy = '0'
    filetype = ''
    session = ''
    # Inclusive range of years accepted by get_station_5min_url.
    MIN_YEAR = 2001
    MAX_YEAR = 2019
    proxies = {
        'http': '127.0.0.1:1080',
        'https': '127.0.0.1:1080'
    }
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                             "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    # Login form fields; replace the placeholders with real credentials.
    data = {"redirect": "", "username": "账号",
            "password": "密码", "login": "Login"}
    autofile = None

    def __init__(self):
        """Open a session and submit the login form through the proxy."""
        print("正在启动爬虫模块..........")
        self.session = requests.session()
        # Send the same headers as the later GET requests so the whole
        # session presents one consistent User-Agent.
        self.session.post(self.head_url + "/?dnode=Clearinghouse",
                          proxies=self.proxies, headers=self.headers,
                          data=self.data)

    def get_station_5min_url(self, year, area_id):
        """Fetch the ``station_5min`` listings for *year* and download them.

        Districts ``3`` up to, but not including, *area_id* are processed.

        Args:
            year: Year to crawl; must lie in ``MIN_YEAR``..``MAX_YEAR``.
            area_id: Exclusive upper bound of the district ids to crawl.

        Returns:
            ``None`` in all cases; an invalid year is reported on stdout
            and nothing is requested.

        Raises:
            requests.HTTPError: If a listing request returns an error status.
        """
        if year < self.MIN_YEAR or year > self.MAX_YEAR:
            print("年份不合法!爬取失败!")
            return None
        self.yy = str(year)
        self.filetype = 'station_5min'  # only 5-minute station data for now
        self.autofile = AutoFile(self.filetype)  # file-handling helper
        for district in range(3, area_id):
            self.district_id = str(district)
            print('正在爬取' + self.yy + '年,第' + self.district_id +
                  '区域,类型为' + self.filetype + '的数据')
            # Let requests build and encode the query string; the resulting
            # URL has the same parameters in the same order as before.
            response = self.session.get(
                self.head_url + '/',
                params={
                    'srq': 'clearinghouse',
                    'district_id': self.district_id,
                    'geotag': 'null',
                    'yy': self.yy,
                    'type': self.filetype,
                    'returnformat': 'text',
                },
                proxies=self.proxies, headers=self.headers)
            # Fail with a clear HTTP error instead of an opaque JSON decode
            # error when the server answers with an error page.
            response.raise_for_status()
            urls = json.loads(response.text)['data']
            for month in urls:  # one entry per month
                print('正在爬取' + month + '的内容')
                # Each item describes one file of that month.
                self.autofile.check_dir(self.district_id, self.yy, month,
                                        list(urls[month]))
        return None
还有一个main函数:
from BD2019.autoLogin import AutoLogin
def main():
    """Log in once, then crawl the 2019 station_5min data for district 3."""
    crawler = AutoLogin()
    crawler.get_station_5min_url(2019, 4)
    # Full crawl over every year and district -- do not try this lightly:
    # for year in range(2001, 2019, 1):
    #     for district_id in range(3, 12, 1):
    #         crawler.get_station_5min_url(year, district_id)


if __name__ == '__main__':
    main()
运行截图如下:
做到这里基本上5分钟站点的数据就可以爬取了。