Python - 为什么这个数据被写入文件不正确?
只有第一个结果正在写入csv,每行包含一个url的一个字母。这不是所有的URL都被写入,每行一个。Python - 为什么这个数据被写入文件不正确?
我在这段代码的最后一部分没有做什么,导致只用结果之一而不是所有结果写入cvs?
import requests
from bs4 import BeautifulSoup
import csv
def grab_listings():
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/2/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/3/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/4/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/5/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/6/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/7/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/8/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/9/")
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class":"wlt_search_results"})
for elem in l_area.findAll("a", {"class":"frame"}):
return elem["href"]
l = grab_listings()
with open ("gyms.csv", "wb") as file:
writer = csv.writer(file)
for row in l:
writer.writerow(row)
简体:
import requests
from bs4 import BeautifulSoup
import csv
def grab_listings():
for i in range(0, 5):
url = "http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/{}/"
r = requests.get(url.format(i + 1))
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class": "wlt_search_results"})
for elem in l_area.findAll("a", {"class": "frame"}):
yield elem["href"]
l = grab_listings()
with open("gyms.csv", "w") as file:
writer = csv.writer(file)
for row in l:
writer.writerow(row)
这当然是代码上的一个重大改进,但是我怀疑这对于OP来说更有用,如果你解释了你已经改变了什么,以及为什么**。 – jonrsharpe
这段代码与原来的代码不一样..你说得对,它可以被简化,但第一个案例没有附加页面,其他所有代码从2到9,而不是从1到5 - 萨洛51分钟前 – Salo
谢谢你的解答。我实际上使用过你的方法,因为它显示了一行中的每个URL,这正是我所寻找的。如上所述,每个字母用逗号分隔。 你能告诉我如何修改你的代码,以便每个字母的逗号不会打破结果吗? – McLeodx
所以我重构你的代码了一下,我认为它应该工作,你会想到它现在:
import requests
from bs4 import BeautifulSoup
import csv
def grab_listings(page_idx):
ret = []
url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/"
"page/{}/").format(page_idx) # the index of the page will be inserted here
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
l_area = soup.find("div", {"class": "wlt_search_results"})
for elem in l_area.findAll("a", {"class": "frame"}):
# be sure to add all your results to a list and return it,
# if you return here then you will only get the first result
ret.append(elem["href"])
return ret
def main():
l = [] # this will be a list of lists
# call the function 9 times here with idx from 1 till 9
for page_idx in range(1, 10):
l.append(grab_listings(page_idx))
print l
with open("gyms.csv", "wb") as f:
writer = csv.writer(f)
for row in l:
# be sure that your row is a list here, if it is only
# a string all characters will be seperated by a comma.
writer.writerow(row)
# for writing each URL in one line separated by commas at the end
# with open("gyms.csv", "wb") as f:
# for row in l:
# string_to_write = ',\n'.join(row)
# f.write(string_to_write)
if __name__ == '__main__':
main()
我添加了一些意见的代码并希望它足够说明问题。如果不是只是要求:)
你在第一次返回后将不会获得更多的数据。 –
你的函数在你第一次返回任何东西时停止执行,在第一个循环中 –
你有两个问题:1.'return'只能发生每个功能一次(你可能想要阅读发电机和'yield');和2.因此,你因此迭代*单个字符串*并将每个字符传递给'writerow'。结果是不可避免的。 – jonrsharpe