BeautifulSoup
BeautifulSoup库
参考北理工Python课程
基本使用
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Build the parse tree with the lxml backend. The markup above never closes
# <body>/<html>; the pretty-printed output shows them filled in.
soup = BeautifulSoup(html, "lxml")
# prettify() renders the tree as an indented string.
pretty_markup = soup.prettify()
print(pretty_markup)
# Text held by the <title> element.
page_title = soup.title.string
print(page_title)
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
Elsie
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
The Dormouse's story
标签选择器
选择元素
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, "html.parser")
# Every <a> tag in the document, as a list-like result.
all_links = soup.find_all('a')
print(all_links)
# First <p> tag found via find().
first_paragraph = soup.find('p')
print(first_paragraph)
# Attribute-style access is shorthand for the same lookup.
print(soup.p)
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<p class="title"><b>The Dormouse's story</b></p>
<p class="title"><b>The Dormouse's story</b></p>
获取名称
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" ><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, "html.parser")
# .name is the tag's element name; for the first <p> this prints "p".
first_paragraph = soup.p
print(first_paragraph.name)
p
获取属性
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, "html.parser")
# class attribute of the first <p>; it is printed as a list, e.g. ['title'].
print(soup.p['class'])
# Same lookup through the explicit attrs dict.
print(soup.p.attrs['class'])
# Print the class attribute of every <p> tag. The loop body must be indented
# for the script to parse.
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
    print(paragraph.attrs['class'])
['title']
['title']
['title']
['story']
['story']
获取内容
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Text of the first <p> tag.
print(soup.p.string)
# .string for every <p>. The second paragraph mixes text and several <a>
# children, so there is no single string and None is printed for it.
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
    print(paragraph.string)
The Dormouse's story
The Dormouse's story
None
...
嵌套选择
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div')
# find() hands back a Tag, so the same navigation works on the result.
print(type(div))
# First <p> inside the <div>.
print(div.p)
# All <p> tags inside the <div>.
paragraphs_in_div = div.find_all('p')
print(paragraphs_in_div)
# Text of the first <a> inside the first <p> of the <div>.
first_link = div.p.a
print(first_link.string)
<class 'bs4.element.Tag'>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
[<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]
Elsie
子节点和子孙节点
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b><a>我是一个a</a></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# .contents lists the direct children of the first <p> (here a <b> and an <a>).
direct_children = soup.p.contents
print(direct_children)
[<b>The Dormouse's story</b>, <a>我是一个a</a>]
from bs4 import BeautifulSoup
import re
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b><div>我是孙子</div>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# .children is an iterator, so printing it shows only the iterator object.
print(soup.p.children)
# Walk the direct children of the first <p>. Only the <b> is a direct child;
# the nested <div> is a grandchild and is not yielded separately. The loop
# body must be indented for the script to parse.
for index, child in enumerate(soup.p.children):
    print(index, child)
<list_iterator object at 0x000002C6C8C9ACC0>
0 <b><div>我是孙子</div>The Dormouse's story</b>
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b><div>我是孙子节点<a>我是曾孙节点</a></div>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# .descendants is a generator, so printing it shows only the generator object.
print(soup.p.descendants)
# Walk every descendant of the first <p>: nested tags and their text nodes,
# each nested level after its parent. The loop body must be indented for the
# script to parse.
for index, node in enumerate(soup.p.descendants):
    print(index, node)
<generator object descendants at 0x000002C6C8C66FC0>
0 <b><div>我是孙子节点<a>我是曾孙节点</a></div>The Dormouse's story</b>
1 <div>我是孙子节点<a>我是曾孙节点</a></div>
2 我是孙子节点
3 <a>我是曾孙节点</a>
4 我是曾孙节点
5 The Dormouse's story
父节点和祖先节点
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<a>我是第二个a</a>
"""
soup = BeautifulSoup(html, 'html.parser')
# .parent is the element directly enclosing the first <a>: the story <p>.
first_link = soup.a
print(first_link.parent)
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<p><div>我是最小的</div></p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# .parents walks outward from the first <div>: its <p>, the story <p>,
# <body>, <html>, and finally the whole document.
ancestors = soup.div.parents
numbered_ancestors = list(enumerate(ancestors))
print(numbered_ancestors)
[(0, <p><div>我是最小的</div></p>), (1, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<p><div>我是最小的</div></p>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>), (2, <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<p><div>我是最小的</div></p>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>), (3, <html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<p><div>我是最小的</div></p>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>), (4,
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<p><div>我是最小的</div></p>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>)]
兄弟节点
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Siblings that follow the first <a>; text nodes between tags are included.
following = list(enumerate(soup.a.next_siblings))
print(following)
# Siblings that precede the first <a> (here just the leading text node).
preceding = list(enumerate(soup.a.previous_siblings))
print(preceding)
[(0, ',\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' and\n'), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, ';\nand they lived at the bottom of a well.')]
[(0, 'Once upon a time there were three little sisters; and their names were\n')]
标准选择器
find_all(name,attrs,recursive,text,**kwargs)
可根据标签名,属性,内容查找文档
name
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Look up all <p> tags once, then show both the matches and the container type.
paragraphs = soup.find_all('p')
print(paragraphs)
print(type(paragraphs))
[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]
<class 'bs4.element.ResultSet'>
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# find_all() can be called again on each result to search inside that tag
# only. The loop body must be indented for the script to parse.
for paragraph in soup.find_all('p'):
    print(paragraph.find_all('div'))
[]
[<div>div1</div>, <div>div2</div>, <div>div3</div>]
[]
attrs
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="123"></a>
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Attribute filters are passed as a dict of attribute name to wanted value.
titled = soup.find_all(attrs={'class': 'title'})
print(titled)
linked = soup.find_all(attrs={'href': '123'})
print(linked)
[<p class="title"><b>The Dormouse's story</b></p>]
[<a href="123"></a>]
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
<div id="123"></div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Keyword form of the attribute filter. "class" is a Python keyword, so the
# argument is spelled class_ with a trailing underscore.
titled = soup.find_all(class_='title')
print(titled)
# The integer 123 is accepted here and matches the tag with id="123".
by_id = soup.find_all(id=123)
print(by_id)
[<p class="title"><b>The Dormouse's story</b></p>]
[<div id="123"></div>]
text
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Search by text content. `string=` is the current name of this filter (it was
# called `text=` before Beautiful Soup 4.4.0, and the old name is deprecated).
# The result holds the matching strings themselves, not the enclosing tags.
print(soup.find_all(string='...'))
# No text node is exactly 'a', so this prints an empty list.
print(soup.find_all(string='a'))
['...']
[]
find(name,attrs,recursive,text,**kwargs)
find返回单个元素,find_all返回所有元素。可以看做find返回find_all的第一个结果
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# The document has no <ul>, so find() gives None rather than an empty list.
missing_list = soup.find('ul')
print(missing_list)
# Tag name and attribute filter combined; only the first match is returned.
title_paragraph = soup.find('p', class_="title")
print(title_paragraph)
None
<p class="title"><b>The Dormouse's story</b></p>
find_parents() find_parent()
find_parents()返回所有祖先节点,而find_parent()直接返回父节点
find_next_siblings() find_next_sibling()
find_next_siblings()返回后面所有兄弟节点 ,而find_next_sibling()则返回后面兄弟的第一个节点
find_previous_siblings() find_previous_sibling()
find_previous_siblings()返回前面所有兄弟节点,而find_previous_sibling()则返回前面的第一个兄弟节点
find_all_next() find_next()
find_all_next()返回节点后所有符合条件的节点,而find_next()返回节点后第一个符合条件的节点
find_all_previous() find_previous()
find_all_previous()返回节点前所有符合条件的节点,find_previous()返回节点前第一个符合条件的节点
CSS选择器
通过select()直接传入CSS选择器即可完成选择
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<div id=123 class="story"></div>
<div name="div"></div>
<ul>
<li>li1</li>
<li>li2</li>
</ul>
<p class="test"></p>
<p id="123" >
<span class="story"></span>
</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Select by id. A CSS identifier cannot start with a digit, so '#123' is not a
# valid selector and the Soup Sieve based select() in Beautiful Soup 4.7+
# rejects it. The attribute selector below matches the same elements.
print(soup.select('[id="123"]'))
# Descendant combinator: every <li> under a <ul>.
print(soup.select('ul li'))
# select() returns a list; take the first <div> in document order.
print(soup.select('div')[0])
# Every tag carrying class="test".
print(soup.select('.test'))
# Tags with class="story" nested inside an element whose id is "123".
print(soup.select('[id="123"] .story'))
[<div class="story" id="123"></div>, <p id="123">
<span class="story"></span>
</p>]
[<li>li1</li>, <li>li2</li>]
<div>div1</div>
[<p class="test"></p>]
[<span class="story"></span>]
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, "html.parser")
# select() can be applied to each matched tag to search within it. The loop
# body must be indented for the script to parse.
for paragraph in soup.select('p'):
    print(paragraph.select('a'))
[]
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
[]
获取属性
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, "html.parser")
# For each <a> found by the CSS selector, print its href and then its class.
# Both prints belong to the loop body (the output alternates href/class) and
# must be indented for the script to parse.
for link in soup.select('a'):
    print(link.attrs['href'])
    print(link['class'])
http://example.com/elsie
['sister']
http://example.com/lacie
['sister']
http://example.com/tillie
['sister']
获取内容
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<div>div1</div>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<div>div2</div>
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div>div3</div>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'html.parser')
# Text of the first <a> tag.
print(soup.find('a').get_text())
print(".........")
# Text of every <a> tag matched by the CSS selector. The loop body must be
# indented for the script to parse.
for link in soup.select('a'):
    print(link.get_text())
Elsie
.........
Elsie
Lacie
Tillie