Python初探,获取头条上的图片
接触Python后,其最普遍的应用场景是获取网站的公开数据。
于是,尝试着用Python下载头条上的图片
一、获得搜索结果
在搜索框中,输入“街拍”,然后打开调试界面,会得到请求的参数,以及返回的数据
通过调整参数中的offset可以模拟下滚动过程中获取数据的过程
对json数据分析后可以得到详情页的地址
二、处理详情页
页面的呈现,是通过JS来动态加载
而图片的地址信息是存储于BASE_DATA中的gallery
对地址分析后,可以得到图片的网址,最后通过下载函数下载就可以了
效果图:
三、全部代码:
import requests
import json
# HTTP headers sent with every request; a desktop-Chrome User-Agent so the
# site serves the normal desktop page rather than a mobile/blocked variant.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
    ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
pic_local_dir = r"G:\pic\\"  # local directory where downloaded images are stored
def download_pic(pic_url, dir_path,):
    """Download a single image and save it under dir_path.

    :param pic_url: URL of the image to fetch
    :param dir_path: local directory prefix (expected to end with a separator)
    :return: True on success, False when the request failed
    """
    print("正在下载的图片地址是:%s" % pic_url)
    try:
        pic = requests.get(pic_url, headers=headers, timeout=50)
    except Exception as e:
        # Best-effort download: report the failure and move on instead of
        # aborting the whole crawl. (Original printed the message twice.)
        print("当前图片错误无法下载 %s" % e)
        return False
    # File name = last path segment of the URL.
    pic_name = pic_url.split("/")[-1]
    file_path = dir_path + pic_name + '.jpg'
    # 'with' guarantees the handle is closed even if the write raises.
    with open(file_path, 'wb') as fp:
        fp.write(pic.content)
    return True
def solve_detail_page(page_url):
    """Extract gallery image URLs from a detail page and download them.

    The page renders its gallery via JS: the image data sits inside a
    JSON.parse(...) call (BASE_DATA gallery). The URLs are recovered by
    string surgery on the raw HTML rather than a real JSON parse.

    :param page_url: URL of the article detail page
    :return: None
    """
    r = requests.get(page_url, headers=headers)
    html_str = r.text
    # Slice out the raw gallery payload between the JSON.parse( call and
    # the "siblingList:" key that follows it.
    start_index = html_str.find("JSON.parse")
    end_index = html_str.find("siblingList:")
    page = html_str[start_index + 12:end_index - 8]
    cleaned = page.replace("\/", "")
    # Every "url" occurrence precedes one candidate image address.
    candidates = [seg[seg.find("http:"):seg.find(r'\"}')]
                  for seg in cleaned.split("url")]
    # BUG FIX: the original removed elements from the list it was iterating,
    # which skips neighbours of removed items (hence the identical filter
    # loop appeared twice and could still leave bad entries). A single
    # comprehension applies the same criteria correctly in one pass.
    items = [c for c in candidates
             if c.startswith("http")
             and c != '\r'
             and r'\",\"' not in c
             and " " not in c
             and r'\",' not in c]
    # Deduplicate by file name (last backslash-separated segment),
    # keeping the first URL seen for each name.
    names = []
    urls = []
    for item in items:
        name = item.split("\\")[-1]
        if name not in names:
            names.append(name)
            urls.append(item)
    for url_i in urls:
        print(url_i)
        download_pic(url_i.replace('\\', '/'), pic_local_dir)
def solve_data(response):
    """Process one search-results response: visit each item's detail page.

    :param response: HTTP response carrying the search-results JSON
    :return: None
    """
    data = response.text
    print("content:", data)
    # Strip a possible JSONP wrapper "(...)" and surrounding whitespace
    # before handing the text to the JSON parser.
    json_text = data.strip('() \n\t\r')
    obj = json.loads(json_text)
    print(obj)
    json_data = obj['data']
    # BUG FIX: the original wrapped the whole index-based while-loop in one
    # try/except KeyError, so a single item without 'article_url' silently
    # aborted every remaining item. Handle each item individually and skip
    # the ones that lack the key. ('data' may be null — guard with `or []`.)
    for item in json_data or []:
        try:
            single_detail_url = item['article_url']
        except KeyError:
            continue
        solve_detail_page(single_detail_url)
def main():
    """Entry point: page through Toutiao search results for '街拍'.

    Fetches result pages at offsets 0, 20, ..., 200 and forwards each
    response to solve_data() for processing.
    """
    url = r"https://www.toutiao.com/api/search/content/"
    params = {
        'aid': 24,
        'offset': 0,
        'format': 'json',
        'keyword': '街拍',
        'autoload': True,
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    # Advancing the offset by the page size emulates scrolling the results.
    for page_offset in range(0, 201, 20):
        print("Current offset: %d" % page_offset)
        params['offset'] = page_offset
        response = requests.get(url, params=params, headers=headers)
        solve_data(response)


if __name__ == '__main__':
    main()