import requests
import re
def access_to_web_pages(offset, timeout=10):
    """Fetch one page of the Maoyan ``/board/4`` listing.

    Args:
        offset: Value sent as the ``offset`` query parameter; it is
            converted with ``str`` before being appended to the URL.
        timeout: Seconds to wait on the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            whole run indefinitely.

    Returns:
        The ``requests.Response`` object. The HTTP status code is not
        inspected here; callers read ``response.text`` as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
    """
    # Browser-like User-Agent; presumably the site rejects the default
    # python-requests agent -- TODO confirm.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        ),
    }
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    return requests.get(url, headers=headers, timeout=timeout)
def get_the_data(response):
    """Extract the per-film fields from one board page.

    Args:
        response: Any object whose ``text`` attribute holds the page HTML
            (in this file, the value returned by ``access_to_web_pages``).

    Returns:
        A list of 4-tuples of strings, one per matched ``<dd>...</dd>``
        entry, in capture order:

        1. the text between the first tag after ``<dd>`` and ``</i>``,
        2. the link text inside ``<p class="name">``,
        3. the raw body of ``<p class="star">`` (whitespace untouched),
        4. the raw body of ``<p class="releasetime">``.

        An empty list is returned when nothing matches.
    """
    # The pattern is split by captured field for readability; the pieces
    # concatenate to the original expression. re.S lets ``.`` cross line
    # breaks, since a single <dd> entry spans several lines of HTML.
    pattern = re.compile(
        r'<dd>.*?>(.*?)</i>'
        r'.*?<p class="name">.*?>(.*?)</a>'
        r'.*?<p class="star">(.*?)</p>'
        r'.*?<p class="releasetime">(.*?)</p>'
        r'.*?</dd>',
        re.S,
    )
    return pattern.findall(response.text)
def data_processing(items):
    """Convert scraped records to lists and trim the third field.

    Args:
        items: Iterable of records (tuples or lists of strings), each
            with at least three fields.

    Returns:
        A new list containing one new list per record. The field at
        index 2 has leading and trailing whitespace removed; all other
        fields are copied unchanged. The input records are not modified.

    Raises:
        IndexError: If a record has fewer than three fields.
    """
    item_list = []
    for item in items:
        row = list(item)  # copy, so tuples become mutable and input is untouched
        row[2] = row[2].strip()
        item_list.append(row)
    return item_list
def save_the_data(item_list, path='maoyan.txt'):
    """Append records to a UTF-8 text file, one field per line.

    Args:
        item_list: Iterable of records, each an iterable of strings.
        path: File to append to. Defaults to ``'maoyan.txt'`` in the
            current working directory, the location originally hard-coded.

    Raises:
        TypeError: If a field is not a string.
        OSError: If the file cannot be opened or written.
    """
    # NOTE(review): the original source had lost its indentation, so it is
    # ambiguous whether the newline followed every field or every record.
    # This writes a newline after every field -- confirm the intended layout.
    with open(path, 'a', encoding='utf-8') as f:
        for record in item_list:
            # One batched call per record instead of two writes per field.
            f.writelines(field + '\n' for field in record)
def main(pages=10, page_size=10):
    """Scrape consecutive board pages and append their records to disk.

    For each page this fetches the HTML, extracts the records, cleans
    them, and saves them before moving on to the next page.

    Args:
        pages: Number of pages to fetch. Defaults to 10, the count
            originally hard-coded.
        page_size: Step between successive ``offset`` values. Defaults
            to 10, the step originally hard-coded.
    """
    for page in range(pages):
        offset = page * page_size
        response = access_to_web_pages(offset)
        items = get_the_data(response)
        item_list = data_processing(items)
        save_the_data(item_list)
if __name__ == '__main__':
    # Run the scraper only when this file is executed as a script, not
    # when it is imported as a module.
    main()
