Python 爬虫练习 —— requests + BeautifulSoup + re
爬取某互联网公司网站的投资产品信息:https://member.niwodai.com/portal/inteBid/inteBidPage.do
首先解析网站结构:
经过上述的网页源代码解析,下面是实现的具体步骤:
import requests
from requests import RequestException
import re
# Request headers sent with every fetch; the User-Agent string is that of a
# desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}
def get_html():
    """Fetch the investment-product listing page and return its HTML.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (a message is printed in the failure case).
    """
    url = 'https://member.niwodai.com/portal/inteBid/inteBidPage.do'
    try:
        # The request has to be issued inside the ``try``: it is the call
        # that raises RequestException (connection errors, timeouts, ...).
        # The timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print('there is an error')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Yield one dict per product entry matched in the page HTML.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing, so a failed download can be passed in directly.

    Yields:
        Dicts with the keys ``'product'``, ``'持有期限'``, ``'rate'`` (the
        captured number with ``'%'`` appended) and ``'剩余待满金额'`` (the
        captured amount with ``'元'`` appended).
    """
    if not html:
        # Nothing to parse; re.findall would raise TypeError on None.
        return
    # re.S lets ``.`` span newlines, since the four captured fields are
    # separated by arbitrary markup that the lazy ``.*?`` gaps skip over.
    pattern = re.compile(
        'mar_r5">(.*?)</strong>.*?fc_9">(.*?)</span>.*?Numfont">(.*?)</em>%.*?'
        'mar_r5">(.*?)</em>.*?</div>',
        re.S,
    )
    for product, term, rate, remaining in pattern.findall(html):
        yield {
            'product': product,
            '持有期限': term,
            'rate': rate + '%',
            '剩余待满金额': remaining + '元',
        }
def main():
    """Download the listing page and print every parsed product record."""
    page_source = get_html()
    records = parse_html(page_source)
    for record in records:
        print(record)
# Run the scraper only when this file is executed as a script, not on import.
if '__main__' == __name__:
    main()
最终结果: