python爬取文件自动保存本地

趁热打铁，把文件保存到本地的代码也实现了。根据目标网站的文件分类形式，

python爬取文件自动保存本地

我选择先处理站点每5分钟的数据，以district->year->month的文件夹嵌套形式来存储文件。同时用类包装了上一篇爬虫的代码，代码分别如下：

autoFile.py（放在 BD2019 包下，文件名大小写需与后面的 import 语句一致）

import os
import urllib
from urllib.request import urlopen
import time
import random


class AutoFile:
    """Create the local folder tree and download listed files into it.

    Files are stored under ``root_path/<filetype>/<district>/<year>/<month>``.
    Paths are built with Windows-style backslash separators.
    """

    # Base directory for all downloads. Every backslash is escaped so the
    # literal contains no invalid escape sequence.
    root_path = "F:\\Pemsdata\\"
    # Scheme and host prepended to the relative URLs from the file listing.
    head_url = 'http://pems.dot.ca.gov'
    # Alias of the urllib.request module used for all network calls.
    request = urllib.request

    def __init__(self, filetype):
        """Set the per-filetype root folder and route urllib through the proxy.

        Args:
            filetype: Name of the data type; used as the first folder level
                below the class-level ``root_path``.
        """
        print("正在启动文件模块..........")
        self.root_path = self.root_path + filetype + '\\'
        proxy = self.request.ProxyHandler({
            'http': '127.0.0.1:1080',
            'https': '127.0.0.1:1080'
        })
        opener = self.request.build_opener(proxy)
        # install_opener makes this opener the default for urllib.request,
        # which is what urlretrieve uses in download_file.
        self.request.install_opener(opener)
        print("正在加载目录：" + self.root_path)

    def mkdir(self, path):
        """Make sure ``path`` exists, creating missing parents as needed.

        Args:
            path: Directory path; surrounding whitespace and trailing
                backslashes are removed before use.

        Returns:
            Always ``True``.
        """
        path = path.strip().rstrip("\\")
        if os.path.exists(path):
            print(path + ' 目录已存在')
        else:
            # exist_ok avoids a crash if the directory appears between the
            # existence check and the creation.
            os.makedirs(path, exist_ok=True)
            print(path + ' 创建成功')
        return True

    def file_num(self, f_path):
        """Count the entries in ``f_path`` whose name ends with ``gz``.

        The directory is listed directly, so the process working directory
        is left untouched.

        Args:
            f_path: Directory to inspect.

        Returns:
            Number of matching entries.
        """
        num = sum(1 for name in os.listdir(f_path) if name.endswith('gz'))
        print("在" + f_path + "目录下已经存文件个数：" + str(num))
        return num

    def check_dir(self, f_district_path, f_yy_path, month, list_path):
        """Ensure the district/year/month folders exist, then download.

        Args:
            f_district_path: District folder name.
            f_yy_path: Year folder name.
            month: Month folder name.
            list_path: File descriptors handed on to ``download_file``.
        """
        f_path = self.root_path + f_district_path
        if not self.mkdir(f_path):  # district level
            return
        for part in (f_yy_path, month):  # year level, then month level
            f_path = f_path + '\\' + part
            if not self.mkdir(f_path):
                return
        print("开始下载........")
        num = self.file_num(f_path)
        self.download_file(f_path, list_path, num)

    def schedule(self, a, b, c):
        """Progress callback for ``urlretrieve``.

        Args:
            a: Number of blocks transferred so far.
            b: Block size in bytes.
            c: Total size in bytes; negative when the size is unknown.
        """
        if c < 0:
            # No total size available, so a percentage cannot be computed;
            # report the byte count instead.
            print('已下载 %d 字节（总大小未知）' % (a * b))
            return
        # A zero-byte file is complete by definition; this also avoids a
        # division by zero.
        per = 100.0 if c == 0 else min(100.0, 100 * a * b / c)
        print('%.2f%%' % per)

    def download_file(self, f_path, list_file, num):
        """Download the entries of ``list_file`` that are not yet on disk.

        Resuming is count based: the first ``num`` entries are assumed to be
        the ones already downloaded, so this relies on the listing keeping a
        stable order between runs.

        Args:
            f_path: Destination directory.
            list_file: List of dicts with ``file_name`` and ``url`` keys.
            num: Number of files already present in ``f_path``.
        """
        e_num = len(list_file)
        if num >= e_num:  # nothing left (also covers extra local files)
            print("已经是最新的数据")
        else:
            print("还需要下载文件个数：" + str(e_num - num))
            for day in list_file[num:]:  # skip what is already downloaded
                print("正在下载文件：" + day['file_name'])
                # Random pause between requests.
                time.sleep(random.randint(1, 3))
                url = self.head_url + day['url']
                print(url)
                target = f_path + '\\' + day['file_name']
                # Download under a temporary name and rename on success, so
                # an interrupted transfer is never counted as a finished
                # "gz" file by file_num on the next run.
                partial = target + '.part'
                self.request.urlretrieve(url, partial, self.schedule)
                os.replace(partial, target)

        print("下载结束")

autoLogin.py（上面 AutoFile 里的下载进度回调函数借鉴了网上的例子；我这里回调拿到的文件总大小是 -1，这通常是因为服务器的响应里没有提供 Content-Length，urlretrieve 无法得知文件总大小。）

import requests
import json
from BD2019.autoFile import AutoFile


class AutoLogin:
    """Log in to the PeMS site and walk its Clearinghouse file listing.

    A ``requests`` session is opened and the login form is posted in
    ``__init__``. ``get_station_5min_url`` then fetches the JSON listing for
    each district and hands every month's file list to ``AutoFile``.
    """

    head_url = 'http://pems.dot.ca.gov'
    # Query parameters of the most recent listing request, kept as strings.
    district_id = '0'
    yy = '0'
    filetype = ''

    # Placeholder; replaced by a requests session in __init__.
    session = ''
    proxies = {
        'http': '127.0.0.1:1080',
        'https': '127.0.0.1:1080'
    }
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                             "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    # Login form fields; username and password are placeholders to fill in.
    data = {"redirect": "", "username": "账号",
            "password": "密码", "login": "Login"}

    # AutoFile instance, created by get_station_5min_url.
    autofile = None
    # Seconds to wait on each HTTP request before giving up, so a stalled
    # proxy or server cannot hang the crawler indefinitely.
    timeout = 30

    def __init__(self):
        """Open a session and submit the login form.

        Raises:
            requests.HTTPError: If the login request returns an error status.
        """
        print("正在启动爬虫模块..........")
        self.session = requests.session()
        response = self.session.post("http://pems.dot.ca.gov/?dnode=Clearinghouse",
                                     proxies=self.proxies, data=self.data,
                                     timeout=self.timeout)
        # Fail here rather than later with an unrelated JSON parsing error.
        response.raise_for_status()

    def get_station_5min_url(self, year, area_id):
        """Download the ``station_5min`` files of ``year`` per district.

        Args:
            year: Year to fetch; only 2001-2019 is accepted.
            area_id: Exclusive upper bound of the district ids to visit.
                Districts ``3 .. area_id - 1`` are processed (the original
                author's note gives 3-12 as the site's district range).

        Returns:
            ``None`` in all cases. An invalid year prints a message and
            returns before any request is made.

        Raises:
            requests.HTTPError: If a listing request returns an error status.
        """
        if year < 2001 or year > 2019:
            print("年份不合法！爬取失败！")
            return None
        file_url = self.head_url + "/?srq=clearinghouse&"
        self.yy = str(year)
        self.filetype = 'station_5min'
        self.autofile = AutoFile(self.filetype)  # handles folders and downloads
        for district in range(3, area_id):
            self.district_id = str(district)
            print('正在爬取' + self.yy + '年,第' + self.district_id +
                  '区域,类型为' + self.filetype + '的数据')
            listing_url = (file_url +
                           'district_id=' + self.district_id +
                           '&geotag=null&yy=' + self.yy +
                           '&type=' + self.filetype +
                           '&returnformat=text')
            response = self.session.get(listing_url, proxies=self.proxies,
                                        headers=self.headers,
                                        timeout=self.timeout)
            response.raise_for_status()
            listing = json.loads(response.text)
            urls = listing['data']
            for month in urls:  # one entry per month
                print('正在爬取' + month + '的内容')
                # Each element is passed through unchanged; AutoFile reads
                # its 'file_name' and 'url' keys.
                list_path = list(urls[month])
                self.autofile.check_dir(self.district_id, self.yy, month, list_path)

还有一个main函数：

from BD2019.autoLogin import AutoLogin

def main():
    """Log in and fetch the 2019 station 5-minute files."""
    crawler = AutoLogin()
    # The second argument is an exclusive upper bound on the district id,
    # so (2019, 4) covers district 3 only.
    crawler.get_station_5min_url(2019, 4)
    # Full crawl over all years and districts -- do not try this lightly:
    # for year in range(2001, 2019, 1):
    #     for district_id in range(3, 12, 1):
    #         crawler.get_station_5min_url(year, district_id)


if __name__ == '__main__':
    main()

运行截图如下：

python爬取文件自动保存本地

做到这里基本上5分钟站点的数据就可以爬取了。

python爬取文件自动保存本地

相关推荐