使用Python和BeautifulSoup 4进行网页抓取 - 初学者
问题描述:
感谢您对上一个问题的帮助(here)。不过,我目前仍然卡在如何准备好最终的数据框上。我已经能够从原始表格中提取所有数据,并按我想要的形式显示出来,但现在我想把主队和客队添加到df中,却怎么也弄不明白。下面是我目前的代码,我想要抓取的网站在这里:here is the site。
from urllib.request import urlopen # import the library
from bs4 import BeautifulSoup # Import BS
from bs4 import SoupStrainer # Import Soup Strainer
import pandas as pd # import pandas as a package
# Script from the question: scrape per-player match statistics and try to
# attach the home/away team names. Behaviour is kept as posted, including the
# problem the question is asking about.
basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
scrapeweb1 = basescrape + matchid
page = urlopen(scrapeweb1)  # open the match statistics page

# Restrict parsing to the player statistics tables (width 583).
only_tables = SoupStrainer('table', attrs={"width": "583"})
soup = BeautifulSoup(page, 'html.parser', parse_only=only_tables)

# Restrict parsing to the quarter-by-quarter score table (width 376).
# NOTE(review): `page` has already been read by the parse above, so this
# second parse most likely receives an empty stream - confirm.
only_teams = SoupStrainer('table', attrs={"width": "376"})
soup2 = BeautifulSoup(page, 'html.parser', parse_only=only_teams)

# Rows carrying this onmouseover attribute are the ones holding player data.
table = soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"})

# One list per output column; the team names start out as empty lists too.
hometeam, awayteam = [], []
player, kicks, handballs, disposals, marks = [], [], [], [], []
goals, behinds, tackles, hitouts, inside50s = [], [], [], [], []
freesfor, freesagainst, fantasy, supercoach = [], [], [], []

# Target lists in the same order as the <td> cells of a player row.
stat_lists = (
    player, kicks, handballs, disposals, marks, goals, behinds,
    tackles, hitouts, inside50s, freesfor, freesagainst, fantasy, supercoach,
)

# Copy the stripped text of the first 14 cells of every player row into the
# matching list.
for stat_row in soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"}):
    cells = stat_row.find_all('td')
    for position, stat_list in enumerate(stat_lists):
        stat_list.append(cells[position].string.strip())

# Take the team names from the first two cells of each "leftbold" row.
# If no such row is found, hometeam/awayteam remain empty lists.
for team_row in soup2.find_all("tr", class_="leftbold"):
    team_cells = team_row.find_all('td')
    hometeam = team_cells[0].string.strip()
    awayteam = team_cells[1].string.strip()

# Map each dataframe column name to its data.
columns = {
    'match_id': matchid,
    'home_team': hometeam,
    'away_team': awayteam,
    'player': player,
    'kicks': kicks,
    'handballs': handballs,
    'disposals': disposals,
    'marks': marks,
    'goals': goals,
    'behinds': behinds,
    'tackles': tackles,
    'hitouts': hitouts,
    'inside_50s': inside50s,
    'frees_for': freesfor,
    'frees_against': freesagainst,
    'fantasy': fantasy,
    'supercoach': supercoach,
}

# Build the dataframe with the columns in the order they were declared above.
df = pd.DataFrame(columns, columns=list(columns))
print(df)
很显然,这个数据框无法创建,因为各个数组的长度并不相同。我该如何抓取主队和客队的名称,并分别保存到一个变量中,使其能像matchid一样使用?
另外,有没有办法让“hometeam”变量出现在前22行,而“awayteam”出现在第23-44行?这样就能把每位球员归属到对应的球队。
我觉得我在下面这一部分做错了:
# Walk every <tr class="leftbold"> row of the team/score soup.
for team_row in soup2.find_all("tr", class_="leftbold"):
    # All <td> cells of the current row.
    team_cells = team_row.find_all('td')
    # Stripped text of the 1st cell is stored as the home team ...
    hometeam = team_cells[0].string.strip()
    # ... and stripped text of the 2nd cell as the away team.
    awayteam = team_cells[1].string.strip()
非常感谢您的帮助。
(还有一个额外的问题:我读到过在字符串拼接中使用“+”不是最佳做法,但我无法用“.join”来生成scrapeweb1。失败的代码如下)
# NOTE: this call raises TypeError because str.join() takes exactly one
# argument (an iterable of strings); passing a single list instead, i.e.
# "".join([basescrape, matchid]), is the working form.
scrapeweb1 = "".join(basescrape, matchid)
编辑:我检查了页面源代码,该表格中似乎有一些不正确的HTML……
<table border="0" cellspacing="0" cellpadding="0" width="376" id="matchscoretable">
<tr>
<th class="leftbold" height="23" width="100">Team</td>
它用“</td>”而不是“</th>”来关闭该单元格,导致用BeautifulSoup解析时表格标签被提前关闭……
[<table border="0" cellpadding="0" cellspacing="0" id="matchscoretable" width="376">
<tr>
<th class="leftbold" height="23" width="100">Team</th></tr></table>]
我可能需要寻找另一种获取主队和客队名称的方法。
答
下面是一种可行的做法:
from urllib.request import urlopen # import the library
from bs4 import BeautifulSoup # Import BS
from bs4 import SoupStrainer # Import Soup Strainer
import pandas as pd # import pandas as a package
# Scrape the per-player statistics of one match into a dataframe with one row
# per player, each row tagged with the team the player belongs to.
basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
url = ''.join([basescrape, matchid])

# Parse only the tables of width 585: these wrap each statistics table
# together with its title row, which carries the team name.
only_tables = SoupStrainer('table', attrs={"width": "585"})

# Use the response as a context manager so the HTTP connection is closed as
# soon as the document has been parsed.
with urlopen(url) as response:
    soup = BeautifulSoup(response, 'html.parser', parse_only=only_tables)

# The table title cells act as anchor points, one per team.
teams = soup.find_all('td', attrs={'class': 'innertbtitle', 'align': 'left'})

# One dictionary per player row.
player_list = []

for team in teams:
    # The team name is taken as the first word of the title text.
    # NOTE(review): a multi-word team name would be cut to its first word
    # here - confirm against the actual title format.
    team_name = team.text.strip().split(' ', maxsplit=1)[0]

    # The statistics table is the next width-583 table after the anchor.
    stats_table = team.find_next('table', attrs={'width': '583'})
    if stats_table is None:
        # No statistics table follows this title; nothing to collect.
        continue

    trs = stats_table.find_all('tr')
    if not trs:
        continue

    # The first row holds the column labels; the remaining rows are players.
    labels = [td.text.strip() for td in trs[0].find_all('td')]

    for row in trs[1:]:
        # Pair each label with the stripped text of the matching cell.
        player_dict = {
            label: value.text.strip()
            for label, value in zip(labels, row.find_all('td'))
        }
        player_dict['team'] = team_name
        player_list.append(player_dict)

# Build the dataframe from the list of per-player dictionaries.
df = pd.DataFrame(player_list)
print(df)
答
我设法解决了这个问题,下面是现在已完成的代码……
from urllib.request import urlopen # import the library
from bs4 import BeautifulSoup # Import BS
from bs4 import SoupStrainer # Import Soup Strainer
import pandas as pd # import pandas as a package
# Scrape the per-player statistics of one match, plus the home and away team
# names, into a single dataframe.
basescrape = 'http://www.footywire.com/afl/footy/ft_match_statistics?mid='
matchid = '6172'
scrapeweb1 = basescrape + matchid

# Download the page once and keep the raw bytes: a response object can only be
# read a single time, but the bytes can be parsed as often as needed. The
# context manager closes the connection afterwards.
with urlopen(scrapeweb1) as page:
    html = page.read()

# First parse: only the player statistics tables (width 583).
only_tables = SoupStrainer('table', attrs={"width": "583"})
soup = BeautifulSoup(html, 'html.parser', parse_only=only_tables)

# Second parse: the whole document, used to locate the team names.
soup2 = BeautifulSoup(html, 'html.parser')

# The second width-375 table is searched for width-124 cells; the first two
# such cells are used as the home and away team names (presumably in that
# order on the page - verify if the layout changes).
Table1 = soup2.find_all('table', attrs={'width': "375"})[1]
team_cells = Table1.find_all('td', attrs={'width': "124"})
hometeam = team_cells[0].string.strip()
awayteam = team_cells[1].string.strip()

# Dataframe column names for the statistics, in the same order as the <td>
# cells of a player row.
stat_columns = [
    'player', 'kicks', 'handballs', 'disposals', 'marks', 'goals', 'behinds',
    'tackles', 'hitouts', 'inside_50s', 'frees_for', 'frees_against',
    'fantasy', 'supercoach',
]
stats = {name: [] for name in stat_columns}

# Rows carrying this onmouseover attribute are the ones holding player data.
for row in soup.find_all("tr", attrs={"onmouseover": "this.bgColor='#cbcdd0';"}):
    col = row.find_all('td')
    # Store the stripped text of each cell under its column name.
    for index, name in enumerate(stat_columns):
        stats[name].append(col[index].string.strip())

# The match id and team names are single values, so the same value appears on
# every row of the dataframe.
columns = {'match_id': matchid, 'home_team': hometeam, 'away_team': awayteam}
columns.update(stats)

# Create the dataframe with an explicit column order.
df = pd.DataFrame(
    columns,
    columns=['match_id', 'home_team', 'away_team'] + stat_columns,
)
print(df)
非常感谢您花时间回答。据我理解,这段代码抓取了所有数据,并把它们放进一个整洁的数据框里? 编辑:我刚抽出时间运行了一下,哇,真是太简洁了!非常感谢您的帮助。抱歉这么麻烦您,但您能帮我理解每一行吗?我正在努力提高自己的Python编码水平。 – MSalty
我目前的理解(以“player_list = []”作为第1行): 1. 创建一个名为“player_list”的空列表 2. 遍历“teams”中的每个“team”BS元素 3. 球队名称取自表格标题文本,按空格拆分后返回第一个值(球队名称) 4. 然后定义一个新的BS元素,即我们要抓取的实际表格中的各行 5. 用标题行为DF创建列标签 6. 创建一个循环,遍历表中其余的每一行 7. 创建另一个循环,将每列的值与对应的标题配对,并存储在“player_dict”中... – MSalty
8. 取出每个“td”标签中的值,并存储在正确的标签下 9. 在“player_dict”中创建一个名为“team”的新键,并把球队名称赋给该行 10. 将完成的行添加到“player_list”变量 11. 用player_list创建数据框 这一切是否正确?我有没有遗漏什么? 再次感谢。 – MSalty